From 901b38c936421a9b8494ccb7d8dae4f771afea8c Mon Sep 17 00:00:00 2001 From: yzwu Date: Thu, 12 Mar 2026 19:21:17 +0800 Subject: [PATCH] [Iluvatar] Optimize decode group_gemm and Support cuda graph for ernie (#6803) --- .github/workflows/_iluvatar_cases.yml | 2 +- .github/workflows/ci_iluvatar.yml | 2 +- .../iluvatar_ops/restore_tokens_per_expert.cu | 71 +++++++++++++++++ custom_ops/iluvatar_ops/w8a16_group_gemv.cu | 78 +++++++++---------- custom_ops/setup_ops.py | 1 + docs/get_started/installation/iluvatar_gpu.md | 24 +++--- .../get_started/installation/iluvatar_gpu.md | 24 +++--- .../model_executor/ops/iluvatar/moe_ops.py | 17 ++-- scripts/run_ci_iluvatar.sh | 2 +- 9 files changed, 140 insertions(+), 81 deletions(-) create mode 100644 custom_ops/iluvatar_ops/restore_tokens_per_expert.cu diff --git a/.github/workflows/_iluvatar_cases.yml b/.github/workflows/_iluvatar_cases.yml index b6e884ee84..9fc14ca66f 100644 --- a/.github/workflows/_iluvatar_cases.yml +++ b/.github/workflows/_iluvatar_cases.yml @@ -7,7 +7,7 @@ on: description: "Build Images" required: true type: string - default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0" + default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312" FASTDEPLOY_ARCHIVE_URL: description: "URL of the compressed FastDeploy code archive." required: true diff --git a/.github/workflows/ci_iluvatar.yml b/.github/workflows/ci_iluvatar.yml index 67d6490986..2e5616c15f 100644 --- a/.github/workflows/ci_iluvatar.yml +++ b/.github/workflows/ci_iluvatar.yml @@ -19,5 +19,5 @@ jobs: needs: [clone] uses: ./.github/workflows/_iluvatar_cases.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312 FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} diff --git a/custom_ops/iluvatar_ops/restore_tokens_per_expert.cu b/custom_ops/iluvatar_ops/restore_tokens_per_expert.cu new file mode 100644 index 0000000000..e1c018ae90 --- /dev/null +++ b/custom_ops/iluvatar_ops/restore_tokens_per_expert.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "helper.h" +#include "iluvatar_context.h" + +void __global__ restore_from_prefix_sum_kernel(const int64_t* prefix_sum, + int64_t* tokens_per_expert, + const int num_experts) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_experts) { + if (idx == 0) { + tokens_per_expert[idx] = prefix_sum[idx]; + } else { + tokens_per_expert[idx] = prefix_sum[idx] - prefix_sum[idx - 1]; + } + } +} + +std::vector RestoreTokensPerExpert( + const paddle::Tensor& tokens_expert_prefix_sum) { + const auto& prefix_sum_dims = tokens_expert_prefix_sum.dims(); + PADDLE_ENFORCE_EQ(prefix_sum_dims.size(), + 1, + common::errors::InvalidArgument( + "tokens_expert_prefix_sum dims is [num_experts]")); + + const int num_experts = prefix_sum_dims[0]; + auto stream = tokens_expert_prefix_sum.stream(); + auto tokens_per_expert = GetEmptyTensor({num_experts}, + tokens_expert_prefix_sum.dtype(), + tokens_expert_prefix_sum.place()); + + const int block_size = 128; + const int grid_size = (num_experts + block_size - 1) / block_size; + restore_from_prefix_sum_kernel<<>>( + const_cast(tokens_expert_prefix_sum.data()), + tokens_per_expert.data(), + num_experts); + + return {tokens_per_expert}; +} + +std::vector> RestoreTokensPerExpertInferShape( + const std::vector& tokens_expert_prefix_sum_shape) { + return {tokens_expert_prefix_sum_shape}; +} + +std::vector RestoreTokensPerExpertInferDtype( + const paddle::DataType& tokens_expert_prefix_sum_dtype) { + return {tokens_expert_prefix_sum_dtype}; +} + +PD_BUILD_STATIC_OP(restore_tokens_per_expert) + .Inputs({"tokens_expert_prefix_sum"}) + .Outputs({"tokens_per_expert"}) + .SetKernelFn(PD_KERNEL(RestoreTokensPerExpert)) + .SetInferShapeFn(PD_INFER_SHAPE(RestoreTokensPerExpertInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(RestoreTokensPerExpertInferDtype)); diff --git a/custom_ops/iluvatar_ops/w8a16_group_gemv.cu b/custom_ops/iluvatar_ops/w8a16_group_gemv.cu index c7d43b43f6..1b013bd631 100644 --- a/custom_ops/iluvatar_ops/w8a16_group_gemv.cu +++ b/custom_ops/iluvatar_ops/w8a16_group_gemv.cu @@ -45,7 +45,7 @@ std::vector GroupGemv(const paddle::Tensor& x, PD_CHECK(ws_dims[1] == n); PD_CHECK(prefix_sum_dims[0] == n_experts); - PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT64); + PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT32); PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 || x.dtype() == paddle::DataType::FLOAT16); PD_CHECK(weight.dtype() == paddle::DataType::INT8); @@ -54,14 +54,7 @@ std::vector GroupGemv(const paddle::Tensor& x, PD_CHECK(weight.is_contiguous()); PD_CHECK(weight_scale.is_contiguous()); - // const int64_t* prefix_sum_ptr = prefix_sum.data(); auto output = GetEmptyTensor({m, n}, x.dtype(), x.place()); - // int16_t* out_data = static_cast(output.data()); - // const int16_t* x_data = static_cast(x.data()); - // const int8_t* weight_data = weight.data(); - // const int16_t* weight_scale_data = - // static_cast(weight_scale.data()); - cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle(); cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; cuinferOperation_t transa = CUINFER_OP_T; @@ -81,16 +74,20 @@ std::vector GroupGemv(const paddle::Tensor& x, cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; cuinferQuantGEMMHostParam cust_host_param; + cuinferCustomGemmHostParamInit(&cust_host_param); cust_host_param.size = sizeof(cuinferQuantGEMMHostParam); cust_host_param.persistent = 0; cust_host_param.groupSize = group_size; - cust_host_param.strideScaleA = n; + // cust_host_param.strideScaleA = n; cust_host_param.expertCount = n_experts; + cust_host_param.type = 2; cuinferQuantGEMMDeviceParam cust_device_param; + cust_device_param.size = sizeof(cuinferQuantGEMMDeviceParam); + cust_device_param.sortedId = nullptr; cust_device_param.bias = nullptr; - cust_device_param.scale = reinterpret_cast(weight_scale.data()); - cust_device_param.nSize = reinterpret_cast(prefix_sum.data()); + cust_device_param.scale = weight_scale.data(); + cust_device_param.nSize = prefix_sum.data(); int lda = k; int ldb = k; @@ -123,36 +120,35 @@ std::vector GroupGemv(const paddle::Tensor& x, cust_device_param.workspace = nullptr; } - CUINFER_CHECK( - cuinferCustomGemmEx(handle, - stream, - cuinfer_ptr_mode, - transa, - transb, - n, - m, - k, - &alpha, - reinterpret_cast(weight.data()), - Atype, - lda, - 0, // lda, - reinterpret_cast(x.data()), - Btype, - ldb, - 0, // ldb, - &beta, - reinterpret_cast(output.data()), - Ctype, - ldc, - 0, // ldc, - batch_count, - computeType, - scaleType, - &cust_host_param, - &cust_device_param, - customOption, - cust_device_param.workspace)); + CUINFER_CHECK(cuinferCustomGemmEx(handle, + stream, + cuinfer_ptr_mode, + transa, + transb, + n, + m, + k, + &alpha, + weight.data(), + Atype, + lda, + 0, + x.data(), + Btype, + ldb, + 0, + &beta, + output.data(), + Ctype, + ldc, + 0, + batch_count, + computeType, + scaleType, + &cust_host_param, + &cust_device_param, + customOption, + cust_device_param.workspace)); return {output}; } diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index 3f2b2e758f..4652e87b95 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -579,6 +579,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"): "iluvatar_ops/mixed_fused_attn.cu", "iluvatar_ops/w8a16_group_gemm.cu", "iluvatar_ops/w8a16_group_gemv.cu", + "iluvatar_ops/restore_tokens_per_expert.cu", "iluvatar_ops/runtime/iluvatar_context.cc", "iluvatar_ops/cpp_extensions.cc", ], diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index aa316d4cbf..f9adc7aa09 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -16,14 +16,14 @@ modinfo iluvatar |grep description Pull the Docker image ```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312 ``` ## 3. Container Preparation ### 3.1 Start Container ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 +docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312 docker exec -it paddle_infer bash ``` @@ -79,7 +79,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) # load the model -graph_optimization_config = {"use_cudagraph": False} +graph_optimization_config = {"use_cudagraph": True} llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config) # Perform batch inference @@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false}' + --graph-optimization-config '{"use_cudagraph": true}' ``` If you want to use v0 loader, please set `--load-choices "default"`. @@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . ```bash python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 ``` -It takes about 52 minutes to run the GSM8K dataset. +It takes about 26 minutes to run the GSM8K dataset. ``` Accuracy: 0.914 Invaild: 0.000 -Latency: 3143.301 s +Latency: 1539.625 s ``` #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking @@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --tool-call-parser ernie_x1 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false}' + --graph-optimization-config '{"use_cudagraph": true}' ``` client: @@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false}' + --graph-optimization-config '{"use_cudagraph": true}' ``` If you want to use v0 loader, please set `--load-choices "default"`. @@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . ```bash python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 ``` -It takes about 52 minutes to run the GSM8K dataset. - -``` -Accuracy: 0.962 -Invaild: 0.000 -Latency: 17332.728 s -``` +The accuracy of the GSM8K dataset is about `0.962`. ### 4.2 ERNIE-4.5-VL series #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index 0ffb1c22ad..52b56720db 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -16,14 +16,14 @@ modinfo iluvatar |grep description Pull the Docker image ```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312 ``` ## 3. 准备容器 ### 3.1 启动容器 ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 +docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312 docker exec -it paddle_infer bash ``` @@ -79,7 +79,7 @@ prompts = [ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) # load the model -graph_optimization_config = {"use_cudagraph": False} +graph_optimization_config = {"use_cudagraph": True} llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config) # Perform batch inference @@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false} + --graph-optimization-config '{"use_cudagraph": true} ``` 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 @@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . ```bash python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 ``` -推理整个GSM8K数据集大概需要52分钟。 +推理整个GSM8K数据集大概需要26分钟。 ``` Accuracy: 0.914 Invaild: 0.000 -Latency: 3143.301 s +Latency: 1539.625 s ``` #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking @@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --tool-call-parser ernie_x1 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false} + --graph-optimization-config '{"use_cudagraph": true} ``` 客户端: @@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false} + --graph-optimization-config '{"use_cudagraph": true} ``` 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 @@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py . ```bash python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 ``` -推理整个GSM8K数据集大概需要4.8个小时。 - -``` -Accuracy: 0.962 -Invaild: 0.000 -Latency: 17332.728 s -``` +推理整个GSM8K数据集的精度大概是`0.962`。 ### 4.2 ERNIE-4.5-VL系列 #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle diff --git a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py index eeca284a45..43f216b8b3 100644 --- a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py +++ b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py @@ -22,12 +22,14 @@ from paddle.nn.quant import weight_only_linear try: from fastdeploy.model_executor.ops.iluvatar import ( + restore_tokens_per_expert, w8a16_group_gemm, w8a16_group_gemv, ) except ImportError: w8a16_group_gemm = None w8a16_group_gemv = None + restore_tokens_per_expert = None def group_gemm( @@ -80,13 +82,14 @@ def group_gemm( ) -def _select_group_gemm_algo(moe_phase: str): - # if moe_phase == "decode": - if False: +def _pre_process_expert_ffn(moe_phase: str, tokens_expert_prefix_sum: paddle.Tensor): + if moe_phase == "decode": group_gemm_func = w8a16_group_gemv + tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32") else: group_gemm_func = w8a16_group_gemm - return group_gemm_func + tokens_per_expert = tokens_expert_prefix_sum + return group_gemm_func, tokens_per_expert def iluvatar_moe_expert_ffn( @@ -110,8 +113,8 @@ def iluvatar_moe_expert_ffn( assert expert_idx_per_token is None assert quant_method in ("weight_only_int8") assert not used_in_ep_low_latency - group_gemm_func = _select_group_gemm_algo(moe_phase) - ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_expert_prefix_sum, -1) + group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum) + ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1) act_out = swiglu(ffn1_output) - output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_expert_prefix_sum, -1) + output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_per_expert, -1) return output diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh index 65008f5a0b..71f8f3ce09 100644 --- a/scripts/run_ci_iluvatar.sh +++ b/scripts/run_ci_iluvatar.sh @@ -198,7 +198,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 8 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": false}' > server.log 2>&1 & + --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 & check_server_status