mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Executor] Default use CUDAGraph (#3594)
* add start intercept * Adjustment GraphOptConfig * pre-commit * default use cudagraph * set default value * default use cuda graph * pre-commit * fix test case bug * disable rl * fix moba attention * only support gpu * Temporarily disable PD Disaggregation * set max_num_seqs of test case as 1 * set max_num_seqs and temperature * fix max_num_batched_tokens bug * close cuda graph * success run wint2 * profile run with max_num_batched_tokens * 1.add c++ memchecker 2.success run wint2 * updatee a800 yaml * update docs * 1. delete check 2. fix plas attn test case * default use use_unique_memory_pool * add try-except for warmup * ban mtp, mm, rl * fix test case mock * fix ci bug * fix form_model_get_output_topp0 bug * fix ci bug * refine deepseek ci * refine code * Disable PD * fix sot yaml
This commit is contained in:
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include <nvml.h>
|
||||
|
||||
float bfloat16_to_float(__nv_bfloat16 x) {
|
||||
uint32_t tmp_x = *(reinterpret_cast<uint16_t*>(&x));
|
||||
@@ -47,3 +48,99 @@ static void PrintMatrix(const T* mat_d,
|
||||
outfile << ss.str();
|
||||
outfile.close();
|
||||
}
|
||||
|
||||
// Singleton constructor: brings up NVML and caches the set of visible devices.
// Throws std::runtime_error when NVML cannot be initialized or enumerated.
GPUMemoryChecker::GPUMemoryChecker() {
  const nvmlReturn_t init_status = nvmlInit_v2();
  if (init_status != NVML_SUCCESS) {
    throw std::runtime_error(std::string("Failed to initialize NVML: ") +
                             nvmlErrorString(init_status));
  }

  const nvmlReturn_t count_status = nvmlDeviceGetCount_v2(&deviceCount_);
  if (count_status != NVML_SUCCESS) {
    // Undo the successful init before reporting the failure.
    nvmlShutdown();
    throw std::runtime_error(std::string("Failed to get GPU count: ") +
                             nvmlErrorString(count_status));
  }

  getCUDAVisibleDevice();
}
|
||||
|
||||
// Tears down the NVML session opened by the constructor.
// The return value of nvmlShutdown() is ignored: destructors must not throw.
GPUMemoryChecker::~GPUMemoryChecker() {
  nvmlShutdown();
}
|
||||
|
||||
void GPUMemoryChecker::getCUDAVisibleDevice(){
|
||||
std::vector<int> devices;
|
||||
const char* env_p = std::getenv("CUDA_VISIBLE_DEVICES");
|
||||
if(!env_p){
|
||||
for(int i = 0; i < deviceCount_; i++){
|
||||
visible_device_.push_back(i);
|
||||
return ;
|
||||
}
|
||||
}
|
||||
|
||||
std::string env_str(env_p);
|
||||
std::istringstream stream(env_str);
|
||||
std::string device_id;
|
||||
|
||||
while(std::getline(stream, device_id, ',')){
|
||||
visible_device_.push_back(std::stoi(device_id));
|
||||
visible_device_mem_usage_.push_back(-1);
|
||||
}
|
||||
std::cout << "\nVisible NVIDIA GPU devices" << env_str << std::endl;
|
||||
return ;
|
||||
}
|
||||
|
||||
void GPUMemoryChecker::addCheckPoint(const char* call_file, int call_line) {
|
||||
try {
|
||||
|
||||
|
||||
for (int i = 0; i < visible_device_.size(); i++) {
|
||||
unsigned int device_id = visible_device_.at(i);
|
||||
nvmlDevice_t device;
|
||||
nvmlReturn_t result = nvmlDeviceGetHandleByIndex_v2(device_id, &device);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get handle for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
|
||||
result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get name for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
nvmlMemory_t memoryInfo;
|
||||
result = nvmlDeviceGetMemoryInfo(device, &memoryInfo);
|
||||
if (NVML_SUCCESS != result) {
|
||||
std::cerr << "Failed to get memory info for GPU " << device_id << ": "
|
||||
<< nvmlErrorString(result) << std::endl;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check GPU memory
|
||||
const char* env_c = std::getenv("MEMCHECKER_CHECK_MEMORY");
|
||||
if (env_c){
|
||||
assert(memoryInfo.used <= visible_device_mem_usage_.at(i) && "GPU Memory does not allow growth!");
|
||||
}
|
||||
visible_device_mem_usage_[i] = memoryInfo.used;
|
||||
}
|
||||
|
||||
// Check GPU memory
|
||||
const char* env_p = std::getenv("MEMCHECKER_PRINT_MEMORY");
|
||||
if (env_p){
|
||||
std::cout << "\nCall Line: "<< call_line << "\t";
|
||||
for (int i = 0; i < visible_device_.size(); i++) {
|
||||
unsigned int device_id = visible_device_.at(i);
|
||||
std::cout << "GPU " << device_id << ": "
|
||||
<< " Used memory: " << visible_device_mem_usage_.at(device_id) / (1024 * 1024) << " MB\t";
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Error: " << e.what() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,9 @@
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <nvml.h>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
@@ -618,6 +621,32 @@ inline bool checkAttentionBackend() {
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifndef GPU_MEMORY_CHECKER_H
#define GPU_MEMORY_CHECKER_H
// Meyers-singleton helper that snapshots per-GPU memory usage through NVML.
// Typical use:
//   GPUMemoryChecker::getInstance()->addCheckPoint(__FILE__, __LINE__);
class GPUMemoryChecker {
 public:
  // Returns the process-wide instance (function-local static initialization
  // is thread-safe since C++11).
  static GPUMemoryChecker* getInstance() {
    static GPUMemoryChecker instance;
    return &instance;
  }

  // Records the current memory usage of every visible device; honors the
  // MEMCHECKER_CHECK_MEMORY / MEMCHECKER_PRINT_MEMORY environment switches
  // (see the implementation file).
  void addCheckPoint(const char* call_file, int call_line);
  // Number of physical GPUs reported by NVML.
  unsigned int getGPUCount() const { return deviceCount_; }
  // (Re)loads the visible-device list from CUDA_VISIBLE_DEVICES.
  void getCUDAVisibleDevice();

  // Singleton: non-copyable.
  GPUMemoryChecker(const GPUMemoryChecker&) = delete;
  void operator=(const GPUMemoryChecker&) = delete;

 private:
  GPUMemoryChecker();   // initializes NVML; throws std::runtime_error on failure
  ~GPUMemoryChecker();  // shuts NVML down

  unsigned int deviceCount_;
  std::vector<unsigned int> visible_device_;
  // Bug fix: NVML reports memory in bytes as unsigned long long
  // (nvmlMemory_t::used); the previous `unsigned int` element type truncated
  // any usage above 4 GiB, corrupting both the growth check and the printout.
  std::vector<unsigned long long> visible_device_mem_usage_;
};
#endif  // GPU_MEMORY_CHECKER_H
|
||||
__device__ __forceinline__ float warpReduceMax(float value) {
|
||||
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 16));
|
||||
value = fmaxf(value, __shfl_xor_sync(0xffffffff, value, 8));
|
||||
|
||||
@@ -251,6 +251,7 @@ if paddle.is_compiled_with_rocm():
|
||||
)
|
||||
elif paddle.is_compiled_with_cuda():
|
||||
sources = [
|
||||
"gpu_ops/helper.cu",
|
||||
"gpu_ops/save_with_output_msg.cc",
|
||||
"gpu_ops/get_output.cc",
|
||||
"gpu_ops/get_output_msg_with_topk.cc",
|
||||
@@ -499,7 +500,7 @@ elif paddle.is_compiled_with_cuda():
|
||||
sources=sources,
|
||||
extra_compile_args={"cxx": cc_compile_args, "nvcc": nvcc_compile_args},
|
||||
libraries=["cublasLt"],
|
||||
extra_link_args=["-lcuda"],
|
||||
extra_link_args=["-lcuda", "-lnvidia-ml"],
|
||||
),
|
||||
packages=find_packages(where="third_party/DeepGEMM"),
|
||||
package_dir={"": "third_party/DeepGEMM"},
|
||||
|
||||
Reference in New Issue
Block a user