[BugFix][Optimization] Replace silent failures with catchable exceptions and informative error messages (#6533)

* init

* init

* fix format

* add

* add files

* add ut

* fix some

* add ut

* add more

* add

* fix pre-commit

* fix pre-commit

* fix cover

* skip long seq

* add

* add

* fix

* remove unneeded code

* fix set attr

* fix comments

* fix comments

* fix failed tests

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
This commit is contained in:
gongweibao
2026-03-16 21:32:43 +08:00
committed by GitHub
parent d113397b09
commit a6351dea0b
61 changed files with 1595 additions and 171 deletions
+11 -10
View File
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "cuda_multiprocess.h"
#include "helper.h"
#include "paddle/extension.h"
@@ -74,19 +75,19 @@ void SwapCacheImplAllLayers(
auto* cache_cpu_ptr_now =
cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
first_gpu_block_id = gpu_block_id;
first_cpu_block_id = cpu_block_id;
@@ -100,22 +101,22 @@ void SwapCacheImplAllLayers(
auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
if (mode == 0) { // copy from device to host
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_cpu_ptr_now,
cache_gpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyDeviceToHost,
stream);
stream));
} else { // copy from host to device
cudaMemcpyAsync(
checkCudaErrors(cudaMemcpyAsync(
cache_gpu_ptr_now,
cache_cpu_ptr_now,
cache_stride * sizeof(DataType_) * consecutive_block_count,
cudaMemcpyHostToDevice,
stream);
stream));
}
}
cudaStreamSynchronize(stream);
checkCudaErrors(cudaStreamSynchronize(stream));
}
void SwapCacheAllLayers(
@@ -126,7 +127,7 @@ void SwapCacheAllLayers(
const std::vector<int64_t>& swap_block_ids_cpu,
int rank,
int mode) {
cudaSetDevice(rank); // used for distributed launch
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
assert(cache_gpu_tensors.size() > 0 &&
cache_gpu_tensors.size() == cache_cpu_ptrs.size());
switch (cache_gpu_tensors[0].dtype()) {