mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix][Optimization] Replace silent failures with catchable exceptions and informative error messages (#6533)
* init * init * fix format * add * add files * add ut * fix some * add ut * add more * add * fix pre-commit * fix pre-commit * fix cover * skip long seq * add * add * fix * remove not need * fix set attr * fix comments * fix comments * fix failed tests --------- Co-authored-by: gongweibao <gognweibao@baidu.com>
This commit is contained in:
@@ -41,6 +41,8 @@
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#ifdef PADDLE_WITH_HIP
|
||||
@@ -52,16 +54,14 @@ namespace cub = hipcub;
|
||||
#define GPU(str) cuda##str
|
||||
#endif
|
||||
|
||||
#define checkCudaErrors(call) \
|
||||
do { \
|
||||
GPU(Error_t) err = call; \
|
||||
if (err != GPU(Success)) { \
|
||||
printf("CUDA error at %s %d: %s\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
GPU(GetErrorString)(err)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
#define checkCudaErrors(call) \
|
||||
do { \
|
||||
GPU(Error_t) err = call; \
|
||||
if (err != GPU(Success)) { \
|
||||
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
|
||||
":" + std::to_string(__LINE__) + " '" + \
|
||||
GPU(GetErrorString)(err) + "'"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
typedef struct shmStruct_st {
|
||||
|
||||
@@ -63,8 +63,8 @@ void decode_alltoall_transpose(paddle::Tensor& inp,
|
||||
auto hidden_size = inp.shape()[1];
|
||||
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
|
||||
if (reg_buffer) {
|
||||
cudaMemcpyAsync(
|
||||
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
|
||||
CUDACHECK(cudaMemcpyAsync(
|
||||
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
|
||||
} else {
|
||||
reg_buffer = inp.data();
|
||||
}
|
||||
@@ -124,8 +124,8 @@ void all_reduce(paddle::Tensor& inp,
|
||||
auto input_size = inp.numel() * phi::SizeOf(inp.dtype());
|
||||
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
|
||||
if (reg_buffer) {
|
||||
cudaMemcpyAsync(
|
||||
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream);
|
||||
CUDACHECK(cudaMemcpyAsync(
|
||||
reg_buffer, inp.data(), input_size, cudaMemcpyDeviceToDevice, stream));
|
||||
} else {
|
||||
reg_buffer = inp.data();
|
||||
}
|
||||
|
||||
@@ -22,19 +22,19 @@
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t e = cmd; \
|
||||
if (e != cudaSuccess) { \
|
||||
printf("Failed: Cuda error %s:%d '%s'\n", \
|
||||
__FILE__, \
|
||||
__LINE__, \
|
||||
cudaGetErrorString(e)); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t e = cmd; \
|
||||
if (e != cudaSuccess) { \
|
||||
throw std::runtime_error(std::string("CUDA error at ") + __FILE__ + \
|
||||
":" + std::to_string(__LINE__) + " '" + \
|
||||
cudaGetErrorString(e) + "'"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
namespace paddle {
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <cstring>
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "helper.h"
|
||||
|
||||
@@ -40,9 +41,9 @@ std::vector<paddle::Tensor> GetDataPtrIpc(const paddle::Tensor &tmp_input,
|
||||
volatile shmStruct *shm = NULL;
|
||||
sharedMemoryInfo info;
|
||||
if (sharedMemoryOpen2(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
|
||||
printf("Failed to create shared memory slab\n");
|
||||
printf("Func GetDataPtrIpc. Shm_name: %s\n", shm_name.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
throw std::runtime_error(
|
||||
"Failed to open shared memory slab in GetDataPtrIpc, shm_name: " +
|
||||
shm_name + ", errno: " + std::string(strerror(errno)));
|
||||
}
|
||||
shm = (volatile shmStruct *)info.addr;
|
||||
void *ptr = nullptr;
|
||||
|
||||
@@ -14,6 +14,8 @@
|
||||
// limitations under the License.
|
||||
#include <nvml.h>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include "fstream"
|
||||
#include "helper.h"
|
||||
#include "iomanip"
|
||||
@@ -136,7 +138,9 @@ void sent_key_value_by_remote_ptr(
|
||||
#endif
|
||||
cudaError_t err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
printf("CUDA Error: %s\n", cudaGetErrorString(err));
|
||||
throw std::runtime_error(
|
||||
std::string("CUDA Error in IPC KV cache transfer: ") +
|
||||
cudaGetErrorString(err));
|
||||
}
|
||||
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
|
||||
cudaDeviceSynchronize();
|
||||
@@ -325,8 +329,11 @@ void SentKeyValueByRemotePtr(const paddle::Tensor& local_key_tensor,
|
||||
reinterpret_cast<dataT*>((void*)remote_value_ptr),
|
||||
cuda_stream);
|
||||
}
|
||||
default: {
|
||||
PD_THROW("Unsupported dtype for IPC KV cache transfer: ",
|
||||
local_key_tensor.type());
|
||||
}
|
||||
}
|
||||
// using dataT=std::remove_pointer<decltype(local_block_ids_ptr)>;
|
||||
}
|
||||
|
||||
void SentKeyValueByRemotePtrBlockSync(const paddle::Tensor& local_key_tensor,
|
||||
|
||||
@@ -90,13 +90,14 @@ struct AttentionKernelTraits {
|
||||
static constexpr bool USE_TMA_LOAD_KV = USE_TMA_LOAD_KV_;
|
||||
static constexpr int GROUP_SIZE = GROUP_SIZE_;
|
||||
static constexpr int BLOCK_SHAPE_Q = BLOCK_SHAPE_Q_;
|
||||
static_assert(BLOCK_SHAPE_Q % 64 == 0);
|
||||
static_assert(BLOCK_SHAPE_Q % 64 == 0,
|
||||
"BLOCK_SHAPE_Q must be a multiple of 64");
|
||||
static constexpr int BLOCK_SHAPE_KV = BLOCK_SHAPE_KV_;
|
||||
static constexpr int HEAD_DIM_QK = HEAD_DIM_QK_;
|
||||
static constexpr int HEAD_DIM_VO = HEAD_DIM_VO_;
|
||||
static constexpr int NUM_PER_STAGE = BLOCK_SHAPE_KV * HEAD_DIM_QK;
|
||||
static_assert(HEAD_DIM_QK % 32 == 0);
|
||||
static_assert(HEAD_DIM_VO % 32 == 0);
|
||||
static_assert(HEAD_DIM_QK % 32 == 0, "HEAD_DIM_QK must be a multiple of 32");
|
||||
static_assert(HEAD_DIM_VO % 32 == 0, "HEAD_DIM_VO must be a multiple of 32");
|
||||
|
||||
static constexpr int NUM_WARPS = 12;
|
||||
static constexpr int NUM_THREADS = 384;
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "paddle/extension.h"
|
||||
|
||||
@@ -53,9 +55,9 @@ void ReadDataIpc(const paddle::Tensor &tmp_input,
|
||||
volatile shmStruct *shm = NULL;
|
||||
sharedMemoryInfo info;
|
||||
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
|
||||
printf("Failed to create shared memory slab\n");
|
||||
printf("Func ReadDataIpc. Shm_name: %s\n", shm_name.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
throw std::runtime_error(
|
||||
"Failed to open shared memory slab in ReadDataIpc, shm_name: " +
|
||||
shm_name + ", errno: " + std::string(strerror(errno)));
|
||||
}
|
||||
shm = (volatile shmStruct *)info.addr;
|
||||
void *ptr = nullptr;
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <cstring>
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "helper.h"
|
||||
|
||||
@@ -85,9 +86,10 @@ void set_data_ipc(const paddle::Tensor& tmp_input,
|
||||
sharedMemoryInfo info;
|
||||
volatile shmStruct* shm = NULL;
|
||||
if (sharedMemoryCreate(shm_name.c_str(), sizeof(*shm), &info) != 0) {
|
||||
printf("Failed to create shared memory slab\n");
|
||||
printf("Func sharedMemoryCreate. Shm_name: %s\n", shm_name.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
throw std::runtime_error(
|
||||
"Failed to create shared memory slab in sharedMemoryCreate, "
|
||||
"shm_name: " +
|
||||
shm_name + ", errno: " + std::string(strerror(errno)));
|
||||
}
|
||||
shm = (volatile shmStruct*)info.addr;
|
||||
memset((void*)shm, 0, sizeof(*shm));
|
||||
|
||||
@@ -12,7 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
@@ -22,6 +21,7 @@
|
||||
#include <sys/mman.h>
|
||||
#include <stdio.h>
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "helper.h"
|
||||
#include "paddle/phi/core/tensor_meta.h"
|
||||
|
||||
std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
|
||||
@@ -30,9 +30,9 @@ std::vector<paddle::Tensor> ShareExternalData(paddle::Tensor &input,
|
||||
volatile shmStruct *shm = NULL;
|
||||
sharedMemoryInfo info;
|
||||
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
|
||||
printf("Failed to create shared memory slab\n");
|
||||
printf("Func ShareExternalData. Shm_name: %s\n", shm_name.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
throw std::runtime_error(
|
||||
"Failed to open shared memory slab in ShareExternalData, shm_name: " +
|
||||
shm_name + ", errno: " + std::string(strerror(errno)));
|
||||
}
|
||||
shm = (volatile shmStruct *)info.addr;
|
||||
void *ptr = nullptr;
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "helper.h"
|
||||
#include "paddle/extension.h"
|
||||
|
||||
@@ -47,24 +48,24 @@ void SwapCacheImpl(const paddle::Tensor& cache_gpu, // gpu
|
||||
auto* cache_gpu_ptr_now = cache_gpu_ptr + gpu_block_id * cache_stride;
|
||||
auto* cache_cpu_ptr_now = cache_cpu_ptr + cpu_block_id * cache_stride;
|
||||
if (mode == 0) { // copy from device to host
|
||||
cudaMemcpyAsync(cache_cpu_ptr_now,
|
||||
cache_gpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_),
|
||||
cudaMemcpyDeviceToHost,
|
||||
stream);
|
||||
checkCudaErrors(cudaMemcpyAsync(cache_cpu_ptr_now,
|
||||
cache_gpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_),
|
||||
cudaMemcpyDeviceToHost,
|
||||
stream));
|
||||
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
|
||||
// sizeof(DataType_), cudaMemcpyDeviceToHost);
|
||||
} else { // copy from host to device
|
||||
cudaMemcpyAsync(cache_gpu_ptr_now,
|
||||
cache_cpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_),
|
||||
cudaMemcpyHostToDevice,
|
||||
stream);
|
||||
checkCudaErrors(cudaMemcpyAsync(cache_gpu_ptr_now,
|
||||
cache_cpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_),
|
||||
cudaMemcpyHostToDevice,
|
||||
stream));
|
||||
// cudaMemcpy(cache_dst_ptr_now, cache_src_ptr_now, cache_stride *
|
||||
// sizeof(DataType_), cudaMemcpyHostToDevice);
|
||||
}
|
||||
}
|
||||
cudaStreamSynchronize(stream);
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void SwapCache(const paddle::Tensor& cache_gpu, // gpu
|
||||
@@ -74,7 +75,7 @@ void SwapCache(const paddle::Tensor& cache_gpu, // gpu
|
||||
const std::vector<int64_t>& swap_block_ids_cpu,
|
||||
int rank,
|
||||
int mode) {
|
||||
cudaSetDevice(rank); // used for distributed launch
|
||||
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
|
||||
switch (cache_gpu.dtype()) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
return SwapCacheImpl<paddle::DataType::BFLOAT16>(cache_gpu,
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "cuda_multiprocess.h"
|
||||
#include "helper.h"
|
||||
#include "paddle/extension.h"
|
||||
|
||||
@@ -74,19 +75,19 @@ void SwapCacheImplAllLayers(
|
||||
auto* cache_cpu_ptr_now =
|
||||
cache_cpu_ptr + first_cpu_block_id * cache_stride;
|
||||
if (mode == 0) { // copy from device to host
|
||||
cudaMemcpyAsync(
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
cache_cpu_ptr_now,
|
||||
cache_gpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_) * consecutive_block_count,
|
||||
cudaMemcpyDeviceToHost,
|
||||
stream);
|
||||
stream));
|
||||
} else { // copy from host to device
|
||||
cudaMemcpyAsync(
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
cache_gpu_ptr_now,
|
||||
cache_cpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_) * consecutive_block_count,
|
||||
cudaMemcpyHostToDevice,
|
||||
stream);
|
||||
stream));
|
||||
}
|
||||
first_gpu_block_id = gpu_block_id;
|
||||
first_cpu_block_id = cpu_block_id;
|
||||
@@ -100,22 +101,22 @@ void SwapCacheImplAllLayers(
|
||||
auto* cache_gpu_ptr_now = cache_gpu_ptr + first_gpu_block_id * cache_stride;
|
||||
auto* cache_cpu_ptr_now = cache_cpu_ptr + first_cpu_block_id * cache_stride;
|
||||
if (mode == 0) { // copy from device to host
|
||||
cudaMemcpyAsync(
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
cache_cpu_ptr_now,
|
||||
cache_gpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_) * consecutive_block_count,
|
||||
cudaMemcpyDeviceToHost,
|
||||
stream);
|
||||
stream));
|
||||
} else { // copy from host to device
|
||||
cudaMemcpyAsync(
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
cache_gpu_ptr_now,
|
||||
cache_cpu_ptr_now,
|
||||
cache_stride * sizeof(DataType_) * consecutive_block_count,
|
||||
cudaMemcpyHostToDevice,
|
||||
stream);
|
||||
stream));
|
||||
}
|
||||
}
|
||||
cudaStreamSynchronize(stream);
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void SwapCacheAllLayers(
|
||||
@@ -126,7 +127,7 @@ void SwapCacheAllLayers(
|
||||
const std::vector<int64_t>& swap_block_ids_cpu,
|
||||
int rank,
|
||||
int mode) {
|
||||
cudaSetDevice(rank); // used for distributed launch
|
||||
checkCudaErrors(cudaSetDevice(rank)); // used for distributed launch
|
||||
assert(cache_gpu_tensors.size() > 0 &&
|
||||
cache_gpu_tensors.size() == cache_cpu_ptrs.size());
|
||||
switch (cache_gpu_tensors[0].dtype()) {
|
||||
|
||||
@@ -734,10 +734,12 @@ void TuneCublasltGemm(const paddle::Tensor& K,
|
||||
const bool is_test,
|
||||
const bool is_read_from_file,
|
||||
const std::string& path) {
|
||||
assert(M_end >= M_start);
|
||||
assert(M_start >= 1);
|
||||
assert(K.dims().size() == 1 && N.dims().size() == 1);
|
||||
assert(is_test != is_read_from_file);
|
||||
assert(M_end >= M_start && "M_end must be >= M_start");
|
||||
assert(M_start >= 1 && "M_start must be >= 1");
|
||||
assert(K.dims().size() == 1 && N.dims().size() == 1 &&
|
||||
"K and N must be 1D tensors");
|
||||
assert(is_test != is_read_from_file &&
|
||||
"Exactly one of is_test or is_read_from_file must be true");
|
||||
|
||||
auto K_cpu = K.copy_to(paddle::CPUPlace(), false);
|
||||
auto N_cpu = N.copy_to(paddle::CPUPlace(), false);
|
||||
@@ -746,7 +748,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
|
||||
|
||||
int K_size = K.numel();
|
||||
int N_size = N.numel();
|
||||
assert(K_size == N_size);
|
||||
assert(K_size == N_size && "K and N must have the same number of elements");
|
||||
|
||||
std::vector<int> mm;
|
||||
int m = M_start, step = 1;
|
||||
@@ -796,7 +798,7 @@ void TuneCublasltGemm(const paddle::Tensor& K,
|
||||
path);
|
||||
} else {
|
||||
// other dtype
|
||||
throw std::runtime_error(dtype + "not currently supported");
|
||||
throw std::runtime_error(dtype + " is not currently supported");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user