[Iluvatar] Optimize decode group_gemm and Support cuda graph for ernie (#6803)

This commit is contained in:
yzwu
2026-03-12 19:21:17 +08:00
committed by GitHub
parent 250ce40b40
commit 901b38c936
9 changed files with 140 additions and 81 deletions
+1 -1
View File
@@ -7,7 +7,7 @@ on:
description: "Build Images" description: "Build Images"
required: true required: true
type: string type: string
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0" default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312"
FASTDEPLOY_ARCHIVE_URL: FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive." description: "URL of the compressed FastDeploy code archive."
required: true required: true
+1 -1
View File
@@ -19,5 +19,5 @@ jobs:
needs: [clone] needs: [clone]
uses: ./.github/workflows/_iluvatar_cases.yml uses: ./.github/workflows/_iluvatar_cases.yml
with: with:
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
@@ -0,0 +1,71 @@
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
#include "iluvatar_context.h"
// Inverts an inclusive prefix sum: for each expert i, recovers the raw
// per-expert token count as prefix_sum[i] - prefix_sum[i-1] (with an
// implicit prefix_sum[-1] == 0). One thread handles one expert slot.
void __global__ restore_from_prefix_sum_kernel(const int64_t* prefix_sum,
                                               int64_t* tokens_per_expert,
                                               const int num_experts) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_experts) {
    return;  // threads beyond the expert count do nothing
  }
  const int64_t prev = (tid == 0) ? static_cast<int64_t>(0) : prefix_sum[tid - 1];
  tokens_per_expert[tid] = prefix_sum[tid] - prev;
}
// Restores the per-expert token counts from the inclusive prefix sum over
// experts (the inverse of a cumsum along the only axis).
//
// @param tokens_expert_prefix_sum  1-D int64 tensor of shape [num_experts]
//                                  holding the inclusive prefix sum of
//                                  tokens routed to each expert.
// @return a single tensor of the same shape/dtype/place containing the raw
//         per-expert token counts.
std::vector<paddle::Tensor> RestoreTokensPerExpert(
    const paddle::Tensor& tokens_expert_prefix_sum) {
  const auto& prefix_sum_dims = tokens_expert_prefix_sum.dims();
  PADDLE_ENFORCE_EQ(prefix_sum_dims.size(),
                    1,
                    common::errors::InvalidArgument(
                        "tokens_expert_prefix_sum dims is [num_experts]"));
  // data<int64_t>() below is only valid for an INT64 tensor — check early.
  PD_CHECK(tokens_expert_prefix_sum.dtype() == paddle::DataType::INT64);
  const int num_experts = prefix_sum_dims[0];
  auto stream = tokens_expert_prefix_sum.stream();
  auto tokens_per_expert = GetEmptyTensor({num_experts},
                                          tokens_expert_prefix_sum.dtype(),
                                          tokens_expert_prefix_sum.place());
  const int block_size = 128;
  const int grid_size = (num_experts + block_size - 1) / block_size;
  // Guard against num_experts == 0: a grid of size 0 is an invalid launch
  // configuration, and there is nothing to compute anyway.
  if (grid_size > 0) {
    // The kernel takes a const pointer, so no const_cast is needed here.
    restore_from_prefix_sum_kernel<<<grid_size, block_size, 0, stream>>>(
        tokens_expert_prefix_sum.data<int64_t>(),
        tokens_per_expert.data<int64_t>(),
        num_experts);
  }
  return {tokens_per_expert};
}
// Shape inference: the restored per-expert counts keep the input's
// [num_experts] shape unchanged.
std::vector<std::vector<int64_t>> RestoreTokensPerExpertInferShape(
    const std::vector<int64_t>& tokens_expert_prefix_sum_shape) {
  std::vector<std::vector<int64_t>> output_shapes;
  output_shapes.push_back(tokens_expert_prefix_sum_shape);
  return output_shapes;
}
// Dtype inference: the output counts use the same dtype as the prefix-sum
// input tensor.
std::vector<paddle::DataType> RestoreTokensPerExpertInferDtype(
    const paddle::DataType& tokens_expert_prefix_sum_dtype) {
  std::vector<paddle::DataType> output_dtypes;
  output_dtypes.push_back(tokens_expert_prefix_sum_dtype);
  return output_dtypes;
}
// Register the custom Paddle op "restore_tokens_per_expert": takes the
// inclusive prefix sum over experts and outputs the raw per-expert token
// counts, wiring up the kernel, shape-inference, and dtype-inference
// functions defined above.
PD_BUILD_STATIC_OP(restore_tokens_per_expert)
    .Inputs({"tokens_expert_prefix_sum"})
    .Outputs({"tokens_per_expert"})
    .SetKernelFn(PD_KERNEL(RestoreTokensPerExpert))
    .SetInferShapeFn(PD_INFER_SHAPE(RestoreTokensPerExpertInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RestoreTokensPerExpertInferDtype));
+37 -41
View File
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
PD_CHECK(ws_dims[1] == n); PD_CHECK(ws_dims[1] == n);
PD_CHECK(prefix_sum_dims[0] == n_experts); PD_CHECK(prefix_sum_dims[0] == n_experts);
PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT64); PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT32);
PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 || PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 ||
x.dtype() == paddle::DataType::FLOAT16); x.dtype() == paddle::DataType::FLOAT16);
PD_CHECK(weight.dtype() == paddle::DataType::INT8); PD_CHECK(weight.dtype() == paddle::DataType::INT8);
@@ -54,14 +54,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
PD_CHECK(weight.is_contiguous()); PD_CHECK(weight.is_contiguous());
PD_CHECK(weight_scale.is_contiguous()); PD_CHECK(weight_scale.is_contiguous());
// const int64_t* prefix_sum_ptr = prefix_sum.data<int64_t>();
auto output = GetEmptyTensor({m, n}, x.dtype(), x.place()); auto output = GetEmptyTensor({m, n}, x.dtype(), x.place());
// int16_t* out_data = static_cast<int16_t*>(output.data());
// const int16_t* x_data = static_cast<const int16_t*>(x.data());
// const int8_t* weight_data = weight.data<int8_t>();
// const int16_t* weight_scale_data =
// static_cast<const int16_t*>(weight_scale.data());
cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle(); cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle();
cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
cuinferOperation_t transa = CUINFER_OP_T; cuinferOperation_t transa = CUINFER_OP_T;
@@ -81,16 +74,20 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
cuinferQuantGEMMHostParam cust_host_param; cuinferQuantGEMMHostParam cust_host_param;
cuinferCustomGemmHostParamInit(&cust_host_param);
cust_host_param.size = sizeof(cuinferQuantGEMMHostParam); cust_host_param.size = sizeof(cuinferQuantGEMMHostParam);
cust_host_param.persistent = 0; cust_host_param.persistent = 0;
cust_host_param.groupSize = group_size; cust_host_param.groupSize = group_size;
cust_host_param.strideScaleA = n; // cust_host_param.strideScaleA = n;
cust_host_param.expertCount = n_experts; cust_host_param.expertCount = n_experts;
cust_host_param.type = 2;
cuinferQuantGEMMDeviceParam cust_device_param; cuinferQuantGEMMDeviceParam cust_device_param;
cust_device_param.size = sizeof(cuinferQuantGEMMDeviceParam);
cust_device_param.sortedId = nullptr;
cust_device_param.bias = nullptr; cust_device_param.bias = nullptr;
cust_device_param.scale = reinterpret_cast<const void*>(weight_scale.data()); cust_device_param.scale = weight_scale.data();
cust_device_param.nSize = reinterpret_cast<const void*>(prefix_sum.data()); cust_device_param.nSize = prefix_sum.data<int32_t>();
int lda = k; int lda = k;
int ldb = k; int ldb = k;
@@ -123,36 +120,35 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
cust_device_param.workspace = nullptr; cust_device_param.workspace = nullptr;
} }
CUINFER_CHECK( CUINFER_CHECK(cuinferCustomGemmEx(handle,
cuinferCustomGemmEx(handle, stream,
stream, cuinfer_ptr_mode,
cuinfer_ptr_mode, transa,
transa, transb,
transb, n,
n, m,
m, k,
k, &alpha,
&alpha, weight.data(),
reinterpret_cast<const void*>(weight.data()), Atype,
Atype, lda,
lda, 0,
0, // lda, x.data(),
reinterpret_cast<const void*>(x.data()), Btype,
Btype, ldb,
ldb, 0,
0, // ldb, &beta,
&beta, output.data(),
reinterpret_cast<void*>(output.data()), Ctype,
Ctype, ldc,
ldc, 0,
0, // ldc, batch_count,
batch_count, computeType,
computeType, scaleType,
scaleType, &cust_host_param,
&cust_host_param, &cust_device_param,
&cust_device_param, customOption,
customOption, cust_device_param.workspace));
cust_device_param.workspace));
return {output}; return {output};
} }
+1
View File
@@ -579,6 +579,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"iluvatar_ops/mixed_fused_attn.cu", "iluvatar_ops/mixed_fused_attn.cu",
"iluvatar_ops/w8a16_group_gemm.cu", "iluvatar_ops/w8a16_group_gemm.cu",
"iluvatar_ops/w8a16_group_gemv.cu", "iluvatar_ops/w8a16_group_gemv.cu",
"iluvatar_ops/restore_tokens_per_expert.cu",
"iluvatar_ops/runtime/iluvatar_context.cc", "iluvatar_ops/runtime/iluvatar_context.cc",
"iluvatar_ops/cpp_extensions.cc", "iluvatar_ops/cpp_extensions.cc",
], ],
+9 -15
View File
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
Pull the Docker image Pull the Docker image
```bash ```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
``` ```
## 3. Container Preparation ## 3. Container Preparation
### 3.1 Start Container ### 3.1 Start Container
```bash ```bash
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
docker exec -it paddle_infer bash docker exec -it paddle_infer bash
``` ```
@@ -79,7 +79,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# load the model # load the model
graph_optimization_config = {"use_cudagraph": False} graph_optimization_config = {"use_cudagraph": True}
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config) llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
# Perform batch inference # Perform batch inference
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \ --max-model-len 32768 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
If you want to use v0 loader, please set `--load-choices "default"`. If you want to use v0 loader, please set `--load-choices "default"`.
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
```bash ```bash
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
``` ```
It takes about 52 minutes to run the GSM8K dataset. It takes about 26 minutes to run the GSM8K dataset.
``` ```
Accuracy: 0.914 Accuracy: 0.914
Invaild: 0.000 Invaild: 0.000
Latency: 3143.301 s Latency: 1539.625 s
``` ```
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--tool-call-parser ernie_x1 \ --tool-call-parser ernie_x1 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
client: client:
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \ --max-model-len 32768 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
If you want to use v0 loader, please set `--load-choices "default"`. If you want to use v0 loader, please set `--load-choices "default"`.
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
```bash ```bash
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
``` ```
It takes about 52 minutes to run the GSM8K dataset. The accuracy of the GSM8K dataset is about `0.962`.
```
Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```
### 4.2 ERNIE-4.5-VL series ### 4.2 ERNIE-4.5-VL series
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
Pull the Docker image Pull the Docker image
```bash ```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
``` ```
## 3. 准备容器 ## 3. 准备容器
### 3.1 启动容器 ### 3.1 启动容器
```bash ```bash
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0 docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
docker exec -it paddle_infer bash docker exec -it paddle_infer bash
``` ```
@@ -79,7 +79,7 @@ prompts = [
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
# load the model # load the model
graph_optimization_config = {"use_cudagraph": False} graph_optimization_config = {"use_cudagraph": True}
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config) llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
# Perform batch inference # Perform batch inference
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \ --max-model-len 32768 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
```bash ```bash
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
``` ```
推理整个GSM8K数据集大概需要52分钟。 推理整个GSM8K数据集大概需要26分钟。
``` ```
Accuracy: 0.914 Accuracy: 0.914
Invaild: 0.000 Invaild: 0.000
Latency: 3143.301 s Latency: 1539.625 s
``` ```
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking #### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--tool-call-parser ernie_x1 \ --tool-call-parser ernie_x1 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
客户端: 客户端:
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \ --max-model-len 32768 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' --graph-optimization-config '{"use_cudagraph": true}'
``` ```
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。 如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
```bash ```bash
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8 python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
``` ```
推理整个GSM8K数据集大概需要4.8个小时 推理整个GSM8K数据集的精度大概是`0.962`
```
Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```
### 4.2 ERNIE-4.5-VL系列 ### 4.2 ERNIE-4.5-VL系列
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle #### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
@@ -22,12 +22,14 @@ from paddle.nn.quant import weight_only_linear
try: try:
from fastdeploy.model_executor.ops.iluvatar import ( from fastdeploy.model_executor.ops.iluvatar import (
restore_tokens_per_expert,
w8a16_group_gemm, w8a16_group_gemm,
w8a16_group_gemv, w8a16_group_gemv,
) )
except ImportError: except ImportError:
w8a16_group_gemm = None w8a16_group_gemm = None
w8a16_group_gemv = None w8a16_group_gemv = None
restore_tokens_per_expert = None
def group_gemm( def group_gemm(
@@ -80,13 +82,14 @@ def group_gemm(
) )
def _select_group_gemm_algo(moe_phase: str): def _pre_process_expert_ffn(moe_phase: str, tokens_expert_prefix_sum: paddle.Tensor):
# if moe_phase == "decode": if moe_phase == "decode":
if False:
group_gemm_func = w8a16_group_gemv group_gemm_func = w8a16_group_gemv
tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32")
else: else:
group_gemm_func = w8a16_group_gemm group_gemm_func = w8a16_group_gemm
return group_gemm_func tokens_per_expert = tokens_expert_prefix_sum
return group_gemm_func, tokens_per_expert
def iluvatar_moe_expert_ffn( def iluvatar_moe_expert_ffn(
@@ -110,8 +113,8 @@ def iluvatar_moe_expert_ffn(
assert expert_idx_per_token is None assert expert_idx_per_token is None
assert quant_method in ("weight_only_int8") assert quant_method in ("weight_only_int8")
assert not used_in_ep_low_latency assert not used_in_ep_low_latency
group_gemm_func = _select_group_gemm_algo(moe_phase) group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_expert_prefix_sum, -1) ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
act_out = swiglu(ffn1_output) act_out = swiglu(ffn1_output)
output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_expert_prefix_sum, -1) output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_per_expert, -1)
return output return output
+1 -1
View File
@@ -198,7 +198,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \ --max-model-len 32768 \
--max-num-seqs 8 \ --max-num-seqs 8 \
--block-size 16 \ --block-size 16 \
--graph-optimization-config '{"use_cudagraph": false}' > server.log 2>&1 & --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
check_server_status check_server_status