[BugFix][Optimization] Replace silent failures with catchable exceptions and informative error messages (#6533)

* init

* init

* fix format

* add

* add files

* add ut

* fix some

* add ut

* add more

* add

* fix pre-commit

* fix pre-commit

* fix cover

* skip long seq

* add

* add

* fix

* remove unneeded code

* fix set attr

* fix comments

* fix comments

* fix failed tests

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
This commit is contained in:
gongweibao
2026-03-16 21:32:43 +08:00
committed by GitHub
parent d113397b09
commit a6351dea0b
61 changed files with 1595 additions and 171 deletions
+11 -10
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -74,19 +75,19 @@ void SwapCacheImplAllLayers(
auto* cache_cpu_ptr_now =
cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
first_gpu_block_id = gpu_block_id;
first_cpu_block_id = cpu_block_id;
@@ -100,22 +101,22 @@ void SwapCacheImplAllLayers(
auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCacheAllLayers(
@@ -126,7 +127,7 @@ void SwapCacheAllLayers(
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
assert(cache_gpu_tensors.size() > 0 &&
cache_gpu_tensors.size() == cache_cpu_ptrs.size());
switch (cache_gpu_tensors[0].dtype()) {