[BugFix][Optimization] Replace silent failures with catchable exceptions and informative error messages (#6533)

* init

* init

* fix format

* add

* add files

* add ut

* fix some

* add ut

* add more

* add

* fix pre-commit

* fix pre-commit

* fix cover

* skip long seq

* add

* add

* fix

* remove unneeded code

* fix set attr

* fix comments

* fix comments

* fix failed tests

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
This commit is contained in:
gongweibao
2026-03-16 21:32:43 +08:00
committed by GitHub
parent d113397b09
commit a6351dea0b
61 changed files with 1595 additions and 171 deletions
+10 -10
View File
@@ -41,6 +41,8 @@
#include <sys/wait.h>
#include <unistd.h>
#endif
#include <stdexcept>
#include <string>
#include <vector>
#ifdef PADDLE_WITH_HIP
@@ -52,16 +54,14 @@ namespace cub = hipcub;
#define GPU(str) cuda##str
#endif
#define checkCudaErrors(call) \
do { \
GPU(Error_t) err = call; \
if (err != GPU(Success)) { \
printf("CUDA error at %s %d: %s\n", \
__FILE__, \
__LINE__, \
GPU(GetErrorString)(err)); \
exit(EXIT_FAILURE); \
} \
#define checkCudaErrors(call) \
do { \
GPU(Error_t) err = call; \
if (err != GPU(Success)) { \
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
":" + std::to_string(__LINE__) + " '" + \
GPU(GetErrorString)(err) + "'"); \
} \
} while (0)
typedef struct shmStruct_st {
@@ -63,8 +63,8 @@ void decode_alltoall_transpose(paddle::Tensor& inp,
auto hidden_size = inp.shape()[1];
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
if (reg_buffer) {
cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
CUDACHECK(cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
} else {
reg_buffer = inp.data();
}
@@ -124,8 +124,8 @@ void all_reduce(paddle::Tensor& inp,
auto input_size = inp.numel() * phi::SizeOf(inp.dtype());
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
if (reg_buffer) {
cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
CUDACHECK(cudaMemcpyAsync(
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
} else {
reg_buffer = inp.data();
}
@@ -22,19 +22,19 @@
#include <iostream>
#include <limits>
#include <map>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
printf("Failed: Cuda error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
#define CUDACHECK(cmd) \
do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
":" + std::to_string(__LINE__) + " '" + \
cudaGetErrorString(e) + "'"); \
} \
} while (0)
namespace paddle {
+4 -3
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "helper.h"
@@ -40,9 +41,9 @@ std::vector<paddle::Tensor> GetDataPtrIpc(const paddle::Tensor &tmp_input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen2(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func GetDataPtrIpc. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in GetDataPtrIpc, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
@@ -14,6 +14,8 @@
// limitations under the License.
#include <nvml.h>
#include <iostream>
#include <stdexcept>
#include <string>
#include "fstream"
#include "helper.h"
#include "iomanip"
@@ -136,7 +138,9 @@ void sent_key_value_by_remote_ptr(
#endif
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA Error: %s\n", cudaGetErrorString(err));
throw std::runtime_error(
std::string("CUDA Error in IPC KV cache transfer: ") +
cudaGetErrorString(err));
}
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaDeviceSynchronize();
@@ -325,8 +329,11 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor,
reinterpret_cast<dataT*>((void*)remote_value_ptr),
cuda_stream);
}
default: {
PD_THROW("Unsupported dtype for IPC KV cache transfer: ",
local_key_tensor.type());
}
}
// using dataT=std::remove_pointer<decltype(local_block_ids_ptr)>;
}
void SentKeyValueByRemotePtrBlockSync(const paddle::Tensor& local_key_tensor,
@@ -90,13 +90,14 @@ struct AttentionKernelTraits {
static constexpr bool USE_TMA_LOAD_KV = USE_TMA_LOAD_KV_;
static constexpr int GROUP_SIZE = GROUP_SIZE_;
static constexpr int BLOCK_SHAPE_Q = BLOCK_SHAPE_Q_;
static_assert(BLOCK_SHAPE_Q % 64 == 0);
static_assert(BLOCK_SHAPE_Q % 64 == 0,
"BLOCK_SHAPE_Q must be a multiple of 64");
static constexpr int BLOCK_SHAPE_KV = BLOCK_SHAPE_KV_;
static constexpr int HEAD_DIM_QK = HEAD_DIM_QK_;
static constexpr int HEAD_DIM_VO = HEAD_DIM_VO_;
static constexpr int NUM_PER_STAGE = BLOCK_SHAPE_KV * HEAD_DIM_QK;
static_assert(HEAD_DIM_QK % 32 == 0);
static_assert(HEAD_DIM_VO % 32 == 0);
static_assert(HEAD_DIM_QK % 32 == 0, "HEAD_DIM_QK must be a multiple of 32");
static_assert(HEAD_DIM_VO % 32 == 0, "HEAD_DIM_VO must be a multiple of 32");
static constexpr int NUM_WARPS = 12;
static constexpr int NUM_THREADS = 384;
+5 -3
View File
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "paddle/extension.h"
@@ -53,9 +55,9 @@ void ReadDataIpc(const paddle::Tensor &tmp_input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func ReadDataIpc. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in ReadDataIpc, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
+5 -3
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include "cuda_multiprocess.h"
#include "helper.h"
@@ -85,9 +86,10 @@ void set_data_ipc(const paddle::Tensor& tmp_input,
sharedMemoryInfo info;
volatile shmStruct* shm = NULL;
if (sharedMemoryCreate(shm_name.c_str(), sizeof(*shm), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func sharedMemoryCreate. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to create shared memory slab in sharedMemoryCreate, "
"shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct*)info.addr;
memset((void*)shm, 0, sizeof(*shm));
+4 -4
View File
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
@@ -22,6 +21,7 @@
#include <sys/mman.h>
#include <stdio.h>
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/phi/core/tensor_meta.h"
std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
@@ -30,9 +30,9 @@ std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
printf("Failed to create shared memory slab\n");
printf("Func ShareExternalData. Shm_name: %s\n", shm_name.c_str());
exit(EXIT_FAILURE);
throw std::runtime_error(
"Failed to open shared memory slab in ShareExternalData, shm_name: " +
shm_name + ", errno: " + std::string(strerror(errno)));
}
shm = (volatile shmStruct *)info.addr;
void *ptr = nullptr;
+13 -12
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -47,24 +48,24 @@ void SwapCacheImpl(const paddle::Tensor& cache_gpu, // gpu
auto* cache_gpu_ptr_now = cache_gpu_ptr + gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyDeviceToHost,
stream);
checkCudaErrors(cudaMemcpyAsync(cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyDeviceToHost,
stream));
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
// sizeof(DataType_), cudaMemcpyDeviceToHost);
} else { // copy from host to device
cudaMemcpyAsync(cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyHostToDevice,
stream);
checkCudaErrors(cudaMemcpyAsync(cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_),
cudaMemcpyHostToDevice,
stream));
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
// sizeof(DataType_), cudaMemcpyHostToDevice);
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCache(const paddle::Tensor& cache_gpu, // gpu
@@ -74,7 +75,7 @@ void SwapCache(const paddle::Tensor& cache_gpu, // gpu
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
switch (cache_gpu.dtype()) {
case paddle::DataType::BFLOAT16:
return SwapCacheImpl<paddle::DataType::BFLOAT16>(cache_gpu,
+11 -10
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -74,19 +75,19 @@ void SwapCacheImplAllLayers(
auto* cache_cpu_ptr_now =
cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
first_gpu_block_id = gpu_block_id;
first_cpu_block_id = cpu_block_id;
@@ -100,22 +101,22 @@ void SwapCacheImplAllLayers(
auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCacheAllLayers(
@@ -126,7 +127,7 @@ void SwapCacheAllLayers(
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
assert(cache_gpu_tensors.size() > 0 &&
cache_gpu_tensors.size() == cache_cpu_ptrs.size());
switch (cache_gpu_tensors[0].dtype()) {
+8 -6
View File
@@ -734,10 +734,12 @@ void TuneCublasltGemm(const paddle::Tensor& K,
const bool is_test,
const bool is_read_from_file,
const std::string& path) {
assert(M_end >= M_start);
assert(M_start >= 1);
assert(K.dims().size() == 1 && N.dims().size() == 1);
assert(is_test != is_read_from_file);
assert(M_end >= M_start && "M_end must be >= M_start");
assert(M_start >= 1 && "M_start must be >= 1");
assert(K.dims().size() == 1 && N.dims().size() == 1 &&
"K and N must be 1D tensors");
assert(is_test != is_read_from_file &&
"Exactly one of is_test or is_read_from_file must be true");
auto K_cpu = K.copy_to(paddle::CPUPlace(), false);
auto N_cpu = N.copy_to(paddle::CPUPlace(), false);
@@ -746,7 +748,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
int K_size = K.numel();
int N_size = N.numel();
assert(K_size == N_size);
assert(K_size == N_size && "K and N must have the same number of elements");
std::vector<int> mm;
int m = M_start, step = 1;
@@ -796,7 +798,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
path);
} else {
// other dtype
throw std::runtime_error(dtype + "not currently supported");
throw std::runtime_error(dtype + " is not currently supported");
}
}
}