mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Executor] Default use CUDAGraph (#3594)
* add start intercept * Adjustment GraphOptConfig * pre-commit * default use cudagraph * set default value * default use cuda graph * pre-commit * fix test case bug * disable rl * fix moba attention * only support gpu * Temporarily disable PD Disaggregation * set max_num_seqs of test case as 1 * set max_num_seqs and temperature * fix max_num_batched_tokens bug * close cuda graph * success run wint2 * profile run with max_num_batched_tokens * 1.add c++ memchecker 2.success run wint2 * updatee a800 yaml * update docs * 1. delete check 2. fix plas attn test case * default use use_unique_memory_pool * add try-except for warmup * ban mtp, mm, rl * fix test case mock * fix ci bug * fix form_model_get_output_topp0 bug * fix ci bug * refine deepseek ci * refine code * Disable PD * fix sot yaml
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include <nvml.h>
|
||||
|
||||
float bfloat16_to_float(__nv_bfloat16 x) {
|
||||
uint32_t tmp_x = *(reinterpret_cast<uint16_t*>(&x));
|
||||
@@ -47,3 +48,99 @@ static void PrintMatrix(const T* mat_d,
|
||||
outfile << ss.str();
|
||||
outfile.close();
|
||||
}
|
||||
|
||||
// Singleton constructor: brings up NVML and caches the set of visible devices.
// Throws std::runtime_error when NVML cannot be initialized or enumerated.
GPUMemoryChecker::GPUMemoryChecker() {
  const nvmlReturn_t init_status = nvmlInit_v2();
  if (init_status != NVML_SUCCESS) {
    throw std::runtime_error(std::string("Failed to initialize NVML: ") +
                             nvmlErrorString(init_status));
  }

  const nvmlReturn_t count_status = nvmlDeviceGetCount_v2(&deviceCount_);
  if (count_status != NVML_SUCCESS) {
    // Undo the successful init before reporting the failure.
    nvmlShutdown();
    throw std::runtime_error(std::string("Failed to get GPU count: ") +
                             nvmlErrorString(count_status));
  }

  getCUDAVisibleDevice();
}
|
||||
|
||||
// Tears down the NVML session opened by the constructor.
// The return value of nvmlShutdown() is ignored: destructors must not throw.
GPUMemoryChecker::~GPUMemoryChecker() {
  nvmlShutdown();
}
|
||||
|
||||
void GPUMemoryChecker::getCUDAVisibleDevice(){
|
||||
std::vector<int> devices;
|
||||
const char* env_p = std::getenv("CUDA_VISIBLE_DEVICES");
|
||||
if(!env_p){
|
||||
for(int i = 0; i < deviceCount_; i++){
|
||||
visible_device_.push_back(i);
|
||||
return ;
|
||||
}
|
||||
}
|
||||
|
||||
std::string env_str(env_p);
|
||||
std::istringstream stream(env_str);
|
||||
std::string device_id;
|
||||
|
||||
while(std::getline(stream, device_id, ',')){
|
||||
visible_device_.push_back(std::stoi(device_id));
|
||||
visible_device_mem_usage_.push_back(-1);
|
||||
}
|
||||
std::cout << "\nVisible NVIDIA GPU devices" << env_str << std::endl;
|
||||
return ;
|
||||
}
|
||||
|
||||
void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
|
||||
try {
|
||||
|
||||
|
||||
for (int i = 0; i < visible_device_.size(); i++) {
|
||||
unsigned int device_id = visible_device_.at(i);
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex_v2(device_id, &device);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get handle for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
|
||||
result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get name for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlMemory_t memoryInfo;
|
||||
result = nvmlDeviceGetMemoryInfo(device, &memoryInfo);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get memory info for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check GPU memory
|
||||
const char* env_c = std::getenv("MEMCHECKER_CHECK_MEMORY");
|
||||
if (env_c){
|
||||
assert(memoryInfo.used <= visible_device_mem_usage_.at(i) && "GPU Memory does not allow growth!");
|
||||
}
|
||||
visible_device_mem_usage_[i] = memoryInfo.used;
|
||||
}
|
||||
|
||||
// Check GPU memory
|
||||
const char* env_p = std::getenv("MEMCHECKER_PRINT_MEMORY");
|
||||
if (env_p){
|
||||
std::cout << "\nCall Line: "<< call_line << "\t";
|
||||
for (int i = 0; i < visible_device_.size(); i++) {
|
||||
unsigned int device_id = visible_device_.at(i);
|
||||
std::cout << "GPU " << device_id << ": "
|
||||
<< " Used memory: " << visible_device_mem_usage_.at(device_id) / (1024 * 1024) << " MB\t";
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,9 @@
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <nvml.h>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
@@ -618,6 +621,32 @@ inline bool checkAttentionBackend() {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifndef GPU_MEMORY_CHECKER_H
#define GPU_MEMORY_CHECKER_H
// Meyers-singleton helper that snapshots per-GPU memory usage through NVML.
// Typical use:
//   GPUMemoryChecker::getInstance()->addCheckPoint(__FILE__, __LINE__);
class GPUMemoryChecker {
 public:
  // Returns the process-wide instance (function-local static initialization
  // is thread-safe since C++11).
  static GPUMemoryChecker* getInstance() {
    static GPUMemoryChecker instance;
    return &instance;
  }

  // Records the current memory usage of every visible device; honors the
  // MEMCHECKER_CHECK_MEMORY / MEMCHECKER_PRINT_MEMORY environment switches
  // (see the implementation file).
  void addCheckPoint(const char* call_file, int call_line);
  // Number of physical GPUs reported by NVML.
  unsigned int getGPUCount() const { return deviceCount_; }
  // (Re)loads the visible-device list from CUDA_VISIBLE_DEVICES.
  void getCUDAVisibleDevice();

  // Singleton: non-copyable.
  GPUMemoryChecker(const GPUMemoryChecker&) = delete;
  void operator=(const GPUMemoryChecker&) = delete;

 private:
  GPUMemoryChecker();   // initializes NVML; throws std::runtime_error on failure
  ~GPUMemoryChecker();  // shuts NVML down

  unsigned int deviceCount_;
  std::vector<unsigned int> visible_device_;
  // Bug fix: NVML reports memory in bytes as unsigned long long
  // (nvmlMemory_t::used); the previous `unsigned int` element type truncated
  // any usage above 4 GiB, corrupting both the growth check and the printout.
  std::vector<unsigned long long> visible_device_mem_usage_;
};
#endif  // GPU_MEMORY_CHECKER_H
|
||||
__device__ __forceinline__ float warpReduceMax(float value) {
|
||||
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16));
|
||||
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8));
|
||||
|
||||
@@ -251,6 +251,7 @@ if paddle.is_compiled_with_rocm():
|
||||
)
|
||||
elif paddle.is_compiled_with_cuda():
|
||||
sources = [
|
||||
"gpu_ops/helper.cu",
|
||||
"gpu_ops/save_with_output_msg.cc",
|
||||
"gpu_ops/get_output.cc",
|
||||
"gpu_ops/get_output_msg_with_topk.cc",
|
||||
@@ -499,7 +500,7 @@ elif paddle.is_compiled_with_cuda():
|
||||
sources=sources,
|
||||
extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
|
||||
libraries=["cublasLt"],
|
||||
extra_link_args=["-lcuda"],
|
||||
extra_link_args=["-lcuda", "-lnvidia-ml"],
|
||||
),
|
||||
packages=find_packages(where="third_party/DeepGEMM"),
|
||||
package_dir={"": "third_party/DeepGEMM"},
|
||||
|
||||
Reference in New Issue
Block a user