[BugFix][Optimization] Replace silent failures with catchable exceptions and informative error messages (#6533)

* init

* init

* fix format

* add

* add files

* add ut

* fix some

* add ut

* add more

* add

* fix pre-commit

* fix pre-commit

* fix cover

* skip long seq

* add

* add

* fix

* remove unneeded code

* fix set attr

* fix comments

* fix comments

* fix failed tests

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
This commit is contained in:
gongweibao
2026-03-16 21:32:43 +08:00
committed by GitHub
parent d113397b09
commit a6351dea0b
61 changed files with 1595 additions and 171 deletions
+10 -10
View File
@@ -41,6 +41,8 @@
#include <sys/wait.h>
#include <unistd.h>
#endif
#include <stdexcept>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_HIP
@@ -52,16 +54,14 @@ namespace cub = hipcub;
#define GPU(str) cuda##str
#endif
#define checkCudaErrors(call) \
do { \
GPU(Error_t) err = call; \
if (err != GPU(Success)) { \
printf("CUDA error at %s %d: %s\n", \
__FILE__, \
__LINE__, \
GPU(GetErrorString)(err)); \
exit(EXIT_FAILURE); \
} \
#define checkCudaErrors(call) \
do { \
GPU(Error_t) err = call; \
if (err != GPU(Success)) { \
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
":" + std::to_string(__LINE__) + " '" + \
GPU(GetErrorString)(err) + "'"); \
} \
} while (0)
typedef struct shmStruct_st {
@@ -63,8 +63,8 @@ void decode_alltoall_transpose(paddle::Tensor& inp,
auto hidden_size = inp.shape()[1];
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
if (reg_buffer) {
cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
CUDACHECK(cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
} else {
reg_buffer = inp.data();
}
@@ -124,8 +124,8 @@ void all_reduce(paddle::Tensor& inp,
auto input_size = inp.numel() * phi::SizeOf(inp.dtype());
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
if (reg_buffer) {
cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
CUDACHECK(cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
} else {
reg_buffer = inp.data();
}
@@ -22,19 +22,19 @@
#include <iostream>
#include <limits>
#include <map>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("Failed: Cuda error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
":" + std::to_string(__LINE__) + " '" + \
cudaGetErrorString(e) + "'"); \
} \
} while (0)
namespace paddle {
+4 -3
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "helper.h"
@@ -40,9 +41,9 @@ std::vector<paddle::Tensor> GetDataPtrIpc(const paddle::Tensor &tmp_input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen2(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func GetDataPtrIpc. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in GetDataPtrIpc, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
@@ -14,6 +14,8 @@
// limitations under the License.
#include <nvml.h>
#include <iostream>
#include <stdexcept>
#include <string>
#include "fstream"
#include "helper.h"
#include "iomanip"
@@ -136,7 +138,9 @@ void sent_key_value_by_remote_ptr(
#endif
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
throw std::runtime_error(
std::string("CUDA Error in IPC KV cache transfer: ") +
cudaGetErrorString(err));
}
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaDeviceSynchronize();
@@ -325,8 +329,11 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor,
reinterpret_cast<dataT*>((void*)remote_value_ptr),
cuda_stream);
}
default: {
PD_THROW("Unsupported dtype for IPC KV cache transfer: ",
local_key_tensor.type());
}
}
// using dataT=std::remove_pointer<decltype(local_block_ids_ptr)>;
}
void SentKeyValueByRemotePtrBlockSync(const paddle::Tensor& local_key_tensor,
@@ -90,13 +90,14 @@ struct AttentionKernelTraits {
static constexpr bool USE_TMA_LOAD_KV = USE_TMA_LOAD_KV_;
static constexpr int GROUP_SIZE = GROUP_SIZE_;
static constexpr int BLOCK_SHAPE_Q = BLOCK_SHAPE_Q_;
static_assert(BLOCK_SHAPE_Q % 64 == 0);
static_assert(BLOCK_SHAPE_Q % 64 == 0,
"BLOCK_SHAPE_Q must be a multiple of 64");
static constexpr int BLOCK_SHAPE_KV = BLOCK_SHAPE_KV_;
static constexpr int HEAD_DIM_QK = HEAD_DIM_QK_;
static constexpr int HEAD_DIM_VO = HEAD_DIM_VO_;
static constexpr int NUM_PER_STAGE = BLOCK_SHAPE_KV * HEAD_DIM_QK;
static_assert(HEAD_DIM_QK % 32 == 0);
static_assert(HEAD_DIM_VO % 32 == 0);
static_assert(HEAD_DIM_QK % 32 == 0, "HEAD_DIM_QK must be a multiple of 32");
static_assert(HEAD_DIM_VO % 32 == 0, "HEAD_DIM_VO must be a multiple of 32");
static constexpr int NUM_WARPS = 12;
static constexpr int NUM_THREADS = 384;
+5 -3
View File
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "paddle/extension.h"
@@ -53,9 +55,9 @@ void ReadDataIpc(const paddle::Tensor &tmp_input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func ReadDataIpc. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in ReadDataIpc, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
+5 -3
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "helper.h"
@@ -85,9 +86,10 @@ void set_data_ipc(const paddle::Tensor& tmp_input,
sharedMemoryInfo info;
volatile shmStruct* shm = NULL;
if (sharedMemoryCreate(shm_name.c_str(), sizeof(*shm), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func sharedMemoryCreate. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to create shared memory slab in sharedMemoryCreate, "
"shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct*)info.addr;
memset((void*)shm, 0, sizeof(*shm));
+4 -4
View File
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
@@ -22,6 +21,7 @@
#include <sys/mman.h>
#include <stdio.h>
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/phi/core/tensor_meta.h"
std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
@@ -30,9 +30,9 @@ std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func ShareExternalData. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in ShareExternalData, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
+13 -12
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -47,24 +48,24 @@ void SwapCacheImpl(const paddle::Tensor& cache_gpu, // gpu
auto* cache_gpu_ptr_now = cache_gpu_ptr + gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyDeviceToHost,
stream);
checkCudaErrors(cudaMemcpyAsync(cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyDeviceToHost,
stream));
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
// sizeof(DataType_), cudaMemcpyDeviceToHost);
} else { // copy from host to device
cudaMemcpyAsync(cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyHostToDevice,
stream);
checkCudaErrors(cudaMemcpyAsync(cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyHostToDevice,
stream));
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
// sizeof(DataType_), cudaMemcpyHostToDevice);
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCache(const paddle::Tensor& cache_gpu, // gpu
@@ -74,7 +75,7 @@ void SwapCache(const paddle::Tensor& cache_gpu, // gpu
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
switch (cache_gpu.dtype()) {
case paddle::DataType::BFLOAT16:
return SwapCacheImpl<paddle::DataType::BFLOAT16>(cache_gpu,
+11 -10
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -74,19 +75,19 @@ void SwapCacheImplAllLayers(
auto* cache_cpu_ptr_now =
cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
first_gpu_block_id = gpu_block_id;
first_cpu_block_id = cpu_block_id;
@@ -100,22 +101,22 @@ void SwapCacheImplAllLayers(
auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCacheAllLayers(
@@ -126,7 +127,7 @@ void SwapCacheAllLayers(
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
assert(cache_gpu_tensors.size() > 0 &&
cache_gpu_tensors.size() == cache_cpu_ptrs.size());
switch (cache_gpu_tensors[0].dtype()) {
+8 -6
View File
@@ -734,10 +734,12 @@ void TuneCublasltGemm(const paddle::Tensor& K,
const bool is_test,
const bool is_read_from_file,
const std::string& path) {
assert(M_end >= M_start);
assert(M_start >= 1);
assert(K.dims().size() == 1 && N.dims().size() == 1);
assert(is_test != is_read_from_file);
assert(M_end >= M_start && "M_end must be >= M_start");
assert(M_start >= 1 && "M_start must be >= 1");
assert(K.dims().size() == 1 && N.dims().size() == 1 &&
"K and N must be 1D tensors");
assert(is_test != is_read_from_file &&
"Exactly one of is_test or is_read_from_file must be true");
auto K_cpu = K.copy_to(paddle::CPUPlace(), false);
auto N_cpu = N.copy_to(paddle::CPUPlace(), false);
@@ -746,7 +748,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
int K_size = K.numel();
int N_size = N.numel();
assert(K_size == N_size);
assert(K_size == N_size && "K and N must have the same number of elements");
std::vector<int> mm;
int m = M_start, step = 1;
@@ -796,7 +798,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
path);
} else {
// other dtype
throw std::runtime_error(dtype + "not currently supported");
throw std::runtime_error(dtype + " is not currently supported");
}
}
}