[Executor] Default use CUDAGraph (#3594)

* add start intercept

* Adjustment GraphOptConfig

* pre-commit

* default use cudagraph

* set default value

* default use cuda graph

* pre-commit

* fix test case bug

* disable rl

* fix moba attention

* only support gpu

* Temporarily disable PD Disaggregation

* set max_num_seqs of test case as 1

* set max_num_seqs and temperature

* fix max_num_batched_tokens bug

* close cuda graph

* success run wint2

* profile run with max_num_batched_tokens

* 1.add c++ memchecker 2.success run wint2

* update a800 yaml

* update docs

* 1. delete check 2. fix plas attn test case

* default use use_unique_memory_pool

* add try-except for warmup

* ban mtp, mm, rl

* fix test case mock

* fix ci bug

* fix form_model_get_output_topp0 bug

* fix ci bug

* refine deepseek ci

* refine code

* Disable PD

* fix sot yaml
This commit is contained in:
RAM
2025-10-21 14:25:45 +08:00
committed by GitHub
parent 99564349a7
commit 775edcc09a
32 changed files with 417 additions and 144 deletions
+97
View File
@@ -13,6 +13,7 @@
// limitations under the License.
#include "helper.h"
#include <nvml.h>
float bfloat16_to_float(__nv_bfloat16 x) {
uint32_t tmp_x = *(reinterpret_cast<uint16_t*>(&x));
@@ -47,3 +48,99 @@ static void PrintMatrix(const T* mat_d,
outfile << ss.str();
outfile.close();
}
// Singleton constructor: bring up the NVML session and discover which GPUs
// this process may see. Throws std::runtime_error when NVML is unavailable.
GPUMemoryChecker::GPUMemoryChecker() {
  const nvmlReturn_t init_status = nvmlInit_v2();
  if (init_status != NVML_SUCCESS) {
    throw std::runtime_error(std::string("Failed to initialize NVML: ") +
                             nvmlErrorString(init_status));
  }
  const nvmlReturn_t count_status = nvmlDeviceGetCount_v2(&deviceCount_);
  if (count_status != NVML_SUCCESS) {
    // Roll back the successful nvmlInit_v2 before reporting the failure.
    nvmlShutdown();
    throw std::runtime_error(std::string("Failed to get GPU count: ") +
                             nvmlErrorString(count_status));
  }
  getCUDAVisibleDevice();
}
// Tear down the NVML session acquired in the constructor.
GPUMemoryChecker::~GPUMemoryChecker() { nvmlShutdown(); }
// Populate visible_device_ (and its parallel usage baseline vector) from
// CUDA_VISIBLE_DEVICES; when the variable is unset, every device that NVML
// reported in the constructor is considered visible.
//
// Bug fixes vs. the previous version:
//  - the `return;` sat INSIDE the fallback loop, so only device 0 was ever
//    registered when CUDA_VISIBLE_DEVICES was unset;
//  - the fallback branch never pushed into visible_device_mem_usage_, so
//    addCheckPoint()'s .at(i) threw std::out_of_range on every call.
void GPUMemoryChecker::getCUDAVisibleDevice() {
  const char* env_p = std::getenv("CUDA_VISIBLE_DEVICES");
  if (!env_p) {
    // No restriction set: every physical device is visible.
    for (unsigned int i = 0; i < deviceCount_; i++) {
      visible_device_.push_back(i);
      // -1 wraps to the max unsigned value, so the first growth assertion
      // in addCheckPoint() always passes (no baseline recorded yet).
      visible_device_mem_usage_.push_back(-1);
    }
    return;
  }
  // Parse the comma-separated list of device ordinals.
  std::string env_str(env_p);
  std::istringstream stream(env_str);
  std::string device_id;
  while (std::getline(stream, device_id, ',')) {
    visible_device_.push_back(std::stoi(device_id));
    visible_device_mem_usage_.push_back(-1);
  }
  std::cout << "\nVisible NVIDIA GPU devices: " << env_str << std::endl;
}
// Record the current memory usage of every visible GPU. When the
// MEMCHECKER_CHECK_MEMORY env var is set, assert that usage did not grow
// since the previous checkpoint (note: assert() is compiled out under
// NDEBUG builds). When MEMCHECKER_PRINT_MEMORY is set, also print usage.
// call_file is currently unused; only call_line is reported.
//
// Bug fix vs. the previous version: the print loop indexed
// visible_device_mem_usage_.at(device_id), but that vector is parallel to
// visible_device_ and must be indexed by i — with a CUDA_VISIBLE_DEVICES
// list such as "6,7" the old code threw std::out_of_range (or printed the
// wrong device's usage).
void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
  try {
    for (size_t i = 0; i < visible_device_.size(); i++) {
      unsigned int device_id = visible_device_.at(i);
      nvmlDevice_t device;
      nvmlReturn_t result = nvmlDeviceGetHandleByIndex_v2(device_id, &device);
      if (NVML_SUCCESS != result) {
        std::cerr << "Failed to get handle for GPU " << device_id << ": "
                  << nvmlErrorString(result) << std::endl;
        continue;  // best-effort: keep checking the remaining devices
      }
      char name[NVML_DEVICE_NAME_BUFFER_SIZE];
      result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
      if (NVML_SUCCESS != result) {
        std::cerr << "Failed to get name for GPU " << device_id << ": "
                  << nvmlErrorString(result) << std::endl;
        continue;
      }
      nvmlMemory_t memoryInfo;
      result = nvmlDeviceGetMemoryInfo(device, &memoryInfo);
      if (NVML_SUCCESS != result) {
        std::cerr << "Failed to get memory info for GPU " << device_id << ": "
                  << nvmlErrorString(result) << std::endl;
        continue;
      }
      // Compare against the previous checkpoint's usage when requested.
      const char* env_c = std::getenv("MEMCHECKER_CHECK_MEMORY");
      if (env_c) {
        assert(memoryInfo.used <= visible_device_mem_usage_.at(i) &&
               "GPU Memory does not allow growth!");
      }
      visible_device_mem_usage_[i] = memoryInfo.used;
    }
    // Optionally print the usage snapshot just recorded.
    const char* env_p = std::getenv("MEMCHECKER_PRINT_MEMORY");
    if (env_p) {
      std::cout << "\nCall Line: " << call_line << "\t";
      for (size_t i = 0; i < visible_device_.size(); i++) {
        unsigned int device_id = visible_device_.at(i);
        std::cout << "GPU " << device_id << ": "
                  << " Used memory: "
                  << visible_device_mem_usage_.at(i) / (1024 * 1024)
                  << " MB\t";
      }
      std::cout << std::endl;  // terminate the line and flush the snapshot
    }
  } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
  }
}
+29
View File
@@ -27,6 +27,9 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <nvml.h>
#include <cassert>
#include <cstdlib>
#include <cstdlib>
#include <cstring>
@@ -618,6 +621,32 @@ inline bool checkAttentionBackend() {
return false;
}
#ifndef GPU_MEMORY_CHECKER_H
#define GPU_MEMORY_CHECKER_H
// Process-wide singleton that samples per-GPU memory usage via NVML so call
// sites can detect unexpected memory growth between checkpoints. NVML is
// initialized lazily on the first getInstance() call and shut down at exit.
class GPUMemoryChecker {
public:
// Meyers-singleton accessor: construction is thread-safe in C++11+.
static GPUMemoryChecker* getInstance() {
static GPUMemoryChecker instance;
return &instance;
}
// Record current usage for every visible GPU; behavior is further controlled
// by the MEMCHECKER_CHECK_MEMORY / MEMCHECKER_PRINT_MEMORY env vars.
// call_file appears unused by the implementation — only call_line is printed.
void addCheckPoint(const char* call_file, int call_line);
// Total GPU count reported by NVML (not restricted by CUDA_VISIBLE_DEVICES).
unsigned int getGPUCount() const { return deviceCount_; }
// Parse CUDA_VISIBLE_DEVICES (or enumerate all devices) into visible_device_.
void getCUDAVisibleDevice();
// Non-copyable: this type owns the process-wide NVML session.
GPUMemoryChecker(const GPUMemoryChecker&) = delete;
void operator=(const GPUMemoryChecker&) = delete;
private:
GPUMemoryChecker();   // calls nvmlInit_v2; throws on failure
~GPUMemoryChecker();  // calls nvmlShutdown
unsigned int deviceCount_;  // device count from nvmlDeviceGetCount_v2
std::vector<unsigned int> visible_device_;  // visible device ordinals
// Last-seen used bytes per visible device; parallel to visible_device_.
std::vector<unsigned int> visible_device_mem_usage_;
};
#endif // GPU_MEMORY_CHECKER_H
__device__ __forceinline__ float warpReduceMax(float value) {
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16));
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8));
+2 -1
View File
@@ -251,6 +251,7 @@ if paddle.is_compiled_with_rocm():
)
elif paddle.is_compiled_with_cuda():
sources = [
"gpu_ops/helper.cu",
"gpu_ops/save_with_output_msg.cc",
"gpu_ops/get_output.cc",
"gpu_ops/get_output_msg_with_topk.cc",
@@ -499,7 +500,7 @@ elif paddle.is_compiled_with_cuda():
sources=sources,
extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
libraries=["cublasLt"],
extra_link_args=["-lcuda"],
extra_link_args=["-lcuda", "-lnvidia-ml"],
),
packages=find_packages(where="third_party/DeepGEMM"),
package_dir={"": "third_party/DeepGEMM"},