[Iluvatar] Optimize decode group_gemm and Support cuda graph for ernie (#6803)

2026-04-23 00:17:25 +08:00 · 2026-03-12 19:21:17 +08:00
parent 250ce40b40
commit 901b38c936
9 changed files with 140 additions and 81 deletions
@@ -7,7 +7,7 @@ on:
        description: "Build Images"
        required: true
        type: string
-        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0"
+        default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312"
      FASTDEPLOY_ARCHIVE_URL:
        description: "URL of the compressed FastDeploy code archive."
        required: true
@@ -19,5 +19,5 @@ jobs:
    needs: [clone]
    uses: ./.github/workflows/_iluvatar_cases.yml
    with:
-      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
@@ -0,0 +1,71 @@
 // Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "helper.h"
 #include "iluvatar_context.h"
 void __global__ restore_from_prefix_sum_kernel(const int64_t* prefix_sum,
                                               int64_t* tokens_per_expert,
                                               const int num_experts) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_experts) {
    if (idx == 0) {
      tokens_per_expert[idx] = prefix_sum[idx];
    } else {
      tokens_per_expert[idx] = prefix_sum[idx] - prefix_sum[idx - 1];
    }
  }
 }
 std::vector<paddle::Tensor> RestoreTokensPerExpert(
    const paddle::Tensor& tokens_expert_prefix_sum) {
  const auto& prefix_sum_dims = tokens_expert_prefix_sum.dims();
  PADDLE_ENFORCE_EQ(prefix_sum_dims.size(),
                    1,
                    common::errors::InvalidArgument(
                        "tokens_expert_prefix_sum dims is [num_experts]"));
  const int num_experts = prefix_sum_dims[0];
  auto stream = tokens_expert_prefix_sum.stream();
  auto tokens_per_expert = GetEmptyTensor({num_experts},
                                          tokens_expert_prefix_sum.dtype(),
                                          tokens_expert_prefix_sum.place());
  const int block_size = 128;
  const int grid_size = (num_experts + block_size - 1) / block_size;
  restore_from_prefix_sum_kernel<<<grid_size, block_size, 0, stream>>>(
      const_cast<int64_t*>(tokens_expert_prefix_sum.data<int64_t>()),
      tokens_per_expert.data<int64_t>(),
      num_experts);
  return {tokens_per_expert};
 }
 std::vector<std::vector<int64_t>> RestoreTokensPerExpertInferShape(
    const std::vector<int64_t>& tokens_expert_prefix_sum_shape) {
  return {tokens_expert_prefix_sum_shape};
 }
 std::vector<paddle::DataType> RestoreTokensPerExpertInferDtype(
    const paddle::DataType& tokens_expert_prefix_sum_dtype) {
  return {tokens_expert_prefix_sum_dtype};
 }
 PD_BUILD_STATIC_OP(restore_tokens_per_expert)
    .Inputs({"tokens_expert_prefix_sum"})
    .Outputs({"tokens_per_expert"})
    .SetKernelFn(PD_KERNEL(RestoreTokensPerExpert))
    .SetInferShapeFn(PD_INFER_SHAPE(RestoreTokensPerExpertInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RestoreTokensPerExpertInferDtype));
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
  PD_CHECK(ws_dims[1] == n);
  PD_CHECK(prefix_sum_dims[0] == n_experts);
-  PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT64);
+  PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT32);
  PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 ||
           x.dtype() == paddle::DataType::FLOAT16);
  PD_CHECK(weight.dtype() == paddle::DataType::INT8);
@@ -54,14 +54,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
  PD_CHECK(weight.is_contiguous());
  PD_CHECK(weight_scale.is_contiguous());
  //   const int64_t* prefix_sum_ptr = prefix_sum.data<int64_t>();
  auto output = GetEmptyTensor({m, n}, x.dtype(), x.place());
  //   int16_t* out_data = static_cast<int16_t*>(output.data());
  //   const int16_t* x_data = static_cast<const int16_t*>(x.data());
  //   const int8_t* weight_data = weight.data<int8_t>();
  //   const int16_t* weight_scale_data =
  //       static_cast<const int16_t*>(weight_scale.data());
  cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle();
  cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
  cuinferOperation_t transa = CUINFER_OP_T;
@@ -81,16 +74,20 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
  cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
  cuinferQuantGEMMHostParam cust_host_param;
  cuinferCustomGemmHostParamInit(&cust_host_param);
  cust_host_param.size = sizeof(cuinferQuantGEMMHostParam);
  cust_host_param.persistent = 0;
  cust_host_param.groupSize = group_size;
-  cust_host_param.strideScaleA = n;
+  // cust_host_param.strideScaleA = n;
  cust_host_param.expertCount = n_experts;
  cust_host_param.type = 2;
  cuinferQuantGEMMDeviceParam cust_device_param;
  cust_device_param.size = sizeof(cuinferQuantGEMMDeviceParam);
  cust_device_param.sortedId = nullptr;
  cust_device_param.bias = nullptr;
-  cust_device_param.scale = reinterpret_cast<const void*>(weight_scale.data());
+  cust_device_param.scale = weight_scale.data();
-  cust_device_param.nSize = reinterpret_cast<const void*>(prefix_sum.data());
+  cust_device_param.nSize = prefix_sum.data<int32_t>();
  int lda = k;
  int ldb = k;
@@ -123,36 +120,35 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
    cust_device_param.workspace = nullptr;
  }
-  CUINFER_CHECK(
+  CUINFER_CHECK(cuinferCustomGemmEx(handle,
-      cuinferCustomGemmEx(handle,
+                                    stream,
-                          stream,
+                                    cuinfer_ptr_mode,
-                          cuinfer_ptr_mode,
+                                    transa,
-                          transa,
+                                    transb,
-                          transb,
+                                    n,
-                          n,
+                                    m,
-                          m,
+                                    k,
-                          k,
+                                    &alpha,
-                          &alpha,
+                                    weight.data(),
-                          reinterpret_cast<const void*>(weight.data()),
+                                    Atype,
-                          Atype,
+                                    lda,
-                          lda,
+                                    0,
-                          0,  // lda,
+                                    x.data(),
-                          reinterpret_cast<const void*>(x.data()),
+                                    Btype,
-                          Btype,
+                                    ldb,
-                          ldb,
+                                    0,
-                          0,  // ldb,
+                                    &beta,
-                          &beta,
+                                    output.data(),
-                          reinterpret_cast<void*>(output.data()),
+                                    Ctype,
-                          Ctype,
+                                    ldc,
-                          ldc,
+                                    0,
-                          0,  // ldc,
+                                    batch_count,
-                          batch_count,
+                                    computeType,
-                          computeType,
+                                    scaleType,
-                          scaleType,
+                                    &cust_host_param,
-                          &cust_host_param,
+                                    &cust_device_param,
-                          &cust_device_param,
+                                    customOption,
-                          customOption,
+                                    cust_device_param.workspace));
                          cust_device_param.workspace));
  return {output};
 }
@@ -579,6 +579,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                "iluvatar_ops/mixed_fused_attn.cu",
                "iluvatar_ops/w8a16_group_gemm.cu",
                "iluvatar_ops/w8a16_group_gemv.cu",
                "iluvatar_ops/restore_tokens_per_expert.cu",
                "iluvatar_ops/runtime/iluvatar_context.cc",
                "iluvatar_ops/cpp_extensions.cc",
            ],
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
 Pull the Docker image
 ```bash
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
 ```
 ## 3. Container Preparation
 ### 3.1 Start Container
 ```bash
-docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
 docker exec -it paddle_infer bash
 ```
@@ -79,7 +79,7 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
 # load the model
-graph_optimization_config = {"use_cudagraph": False}
+graph_optimization_config = {"use_cudagraph": True}
 llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
 # Perform batch inference
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}'
+       --graph-optimization-config '{"use_cudagraph": true}'
 ```
 If you want to use v0 loader, please set `--load-choices "default"`.
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
 ```bash
 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
 ```
-It takes about 52 minutes to run the GSM8K dataset.
+It takes about 26 minutes to run the GSM8K dataset.
 ```
 Accuracy: 0.914
 Invaild: 0.000
-Latency: 3143.301 s
+Latency: 1539.625 s
 ```
 #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --tool-call-parser ernie_x1 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}'
+       --graph-optimization-config '{"use_cudagraph": true}'
 ```
 client:
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}'
+       --graph-optimization-config '{"use_cudagraph": true}'
 ```
 If you want to use v0 loader, please set `--load-choices "default"`.
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
 ```bash
 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
 ```
-It takes about 52 minutes to run the GSM8K dataset.
+The accuracy of the GSM8K dataset is about `0.962`.
 ```
 Accuracy: 0.962
 Invaild: 0.000
 Latency: 17332.728 s
 ```
 ### 4.2 ERNIE-4.5-VL series
 #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
 Pull the Docker image
 ```bash
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
 ```
 ## 3. 准备容器
 ### 3.1 启动容器
 ```bash
-docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
 docker exec -it paddle_infer bash
 ```
@@ -79,7 +79,7 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
 # load the model
-graph_optimization_config = {"use_cudagraph": False}
+graph_optimization_config = {"use_cudagraph": True}
 llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
 # Perform batch inference
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}
+       --graph-optimization-config '{"use_cudagraph": true}
 ```
 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
 ```bash
 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
 ```
-推理整个GSM8K数据集大概需要52分钟。
+推理整个GSM8K数据集大概需要26分钟。
 ```
 Accuracy: 0.914
 Invaild: 0.000
-Latency: 3143.301 s
+Latency: 1539.625 s
 ```
 #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --tool-call-parser ernie_x1 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}
+       --graph-optimization-config '{"use_cudagraph": true}
 ```
 客户端:
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}
+       --graph-optimization-config '{"use_cudagraph": true}
 ```
 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
 ```bash
 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
 ```
-推理整个GSM8K数据集大概需要4.8个小时。
+推理整个GSM8K数据集的精度大概是`0.962`。
 ```
 Accuracy: 0.962
 Invaild: 0.000
 Latency: 17332.728 s
 ```
 ### 4.2 ERNIE-4.5-VL系列
 #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
@@ -22,12 +22,14 @@ from paddle.nn.quant import weight_only_linear
 try:
    from fastdeploy.model_executor.ops.iluvatar import (
        restore_tokens_per_expert,
        w8a16_group_gemm,
        w8a16_group_gemv,
    )
 except ImportError:
    w8a16_group_gemm = None
    w8a16_group_gemv = None
    restore_tokens_per_expert = None
 def group_gemm(
@@ -80,13 +82,14 @@ def group_gemm(
        )
-def _select_group_gemm_algo(moe_phase: str):
+def _pre_process_expert_ffn(moe_phase: str, tokens_expert_prefix_sum: paddle.Tensor):
-    # if moe_phase == "decode":
+    if moe_phase == "decode":
    if False:
        group_gemm_func = w8a16_group_gemv
        tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32")
    else:
        group_gemm_func = w8a16_group_gemm
-    return group_gemm_func
+        tokens_per_expert = tokens_expert_prefix_sum
    return group_gemm_func, tokens_per_expert
 def iluvatar_moe_expert_ffn(
@@ -110,8 +113,8 @@ def iluvatar_moe_expert_ffn(
    assert expert_idx_per_token is None
    assert quant_method in ("weight_only_int8")
    assert not used_in_ep_low_latency
-    group_gemm_func = _select_group_gemm_algo(moe_phase)
+    group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
-    ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_expert_prefix_sum, -1)
+    ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
    act_out = swiglu(ffn1_output)
-    output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_expert_prefix_sum, -1)
+    output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_per_expert, -1)
    return output
@@ -198,7 +198,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
-       --graph-optimization-config '{"use_cudagraph": false}' > server.log 2>&1 &
+       --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
 check_server_status