mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Iluvatar] Optimize decode group_gemm and Support cuda graph for ernie (#6803)
This commit is contained in:
@@ -7,7 +7,7 @@ on:
|
||||
description: "Build Images"
|
||||
required: true
|
||||
type: string
|
||||
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0"
|
||||
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312"
|
||||
FASTDEPLOY_ARCHIVE_URL:
|
||||
description: "URL of the compressed FastDeploy code archive."
|
||||
required: true
|
||||
|
||||
@@ -19,5 +19,5 @@ jobs:
|
||||
needs: [clone]
|
||||
uses: ./.github/workflows/_iluvatar_cases.yml
|
||||
with:
|
||||
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
||||
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include "iluvatar_context.h"
|
||||
|
||||
// Recovers per-expert token counts from an inclusive prefix sum:
// tokens_per_expert[i] = prefix_sum[i] - prefix_sum[i - 1], treating
// the element before index 0 as zero. One thread per expert.
void __global__ restore_from_prefix_sum_kernel(const int64_t* prefix_sum,
                                               int64_t* tokens_per_expert,
                                               const int num_experts) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_experts) {
    return;  // surplus threads in the last block do nothing
  }
  const int64_t prev = (tid == 0) ? static_cast<int64_t>(0) : prefix_sum[tid - 1];
  tokens_per_expert[tid] = prefix_sum[tid] - prev;
}
|
||||
|
||||
std::vector<paddle::Tensor> RestoreTokensPerExpert(
|
||||
const paddle::Tensor& tokens_expert_prefix_sum) {
|
||||
const auto& prefix_sum_dims = tokens_expert_prefix_sum.dims();
|
||||
PADDLE_ENFORCE_EQ(prefix_sum_dims.size(),
|
||||
1,
|
||||
common::errors::InvalidArgument(
|
||||
"tokens_expert_prefix_sum dims is [num_experts]"));
|
||||
|
||||
const int num_experts = prefix_sum_dims[0];
|
||||
auto stream = tokens_expert_prefix_sum.stream();
|
||||
auto tokens_per_expert = GetEmptyTensor({num_experts},
|
||||
tokens_expert_prefix_sum.dtype(),
|
||||
tokens_expert_prefix_sum.place());
|
||||
|
||||
const int block_size = 128;
|
||||
const int grid_size = (num_experts + block_size - 1) / block_size;
|
||||
restore_from_prefix_sum_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||
const_cast<int64_t*>(tokens_expert_prefix_sum.data<int64_t>()),
|
||||
tokens_per_expert.data<int64_t>(),
|
||||
num_experts);
|
||||
|
||||
return {tokens_per_expert};
|
||||
}
|
||||
|
||||
// Shape inference: the output (tokens_per_expert) has exactly the same
// shape as the input prefix-sum tensor — one entry per expert.
std::vector<std::vector<int64_t>> RestoreTokensPerExpertInferShape(
    const std::vector<int64_t>& tokens_expert_prefix_sum_shape) {
  std::vector<std::vector<int64_t>> output_shapes;
  output_shapes.push_back(tokens_expert_prefix_sum_shape);
  return output_shapes;
}
|
||||
|
||||
std::vector<paddle::DataType> RestoreTokensPerExpertInferDtype(
|
||||
const paddle::DataType& tokens_expert_prefix_sum_dtype) {
|
||||
return {tokens_expert_prefix_sum_dtype};
|
||||
}
|
||||
|
||||
// Registers the custom op; it becomes callable from Python as
// fastdeploy.model_executor.ops.iluvatar.restore_tokens_per_expert.
PD_BUILD_STATIC_OP(restore_tokens_per_expert)
    .Inputs({"tokens_expert_prefix_sum"})
    .Outputs({"tokens_per_expert"})
    .SetKernelFn(PD_KERNEL(RestoreTokensPerExpert))
    .SetInferShapeFn(PD_INFER_SHAPE(RestoreTokensPerExpertInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(RestoreTokensPerExpertInferDtype));
|
||||
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
||||
PD_CHECK(ws_dims[1] == n);
|
||||
PD_CHECK(prefix_sum_dims[0] == n_experts);
|
||||
|
||||
PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT64);
|
||||
PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT32);
|
||||
PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 ||
|
||||
x.dtype() == paddle::DataType::FLOAT16);
|
||||
PD_CHECK(weight.dtype() == paddle::DataType::INT8);
|
||||
@@ -54,14 +54,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
||||
PD_CHECK(weight.is_contiguous());
|
||||
PD_CHECK(weight_scale.is_contiguous());
|
||||
|
||||
// const int64_t* prefix_sum_ptr = prefix_sum.data<int64_t>();
|
||||
auto output = GetEmptyTensor({m, n}, x.dtype(), x.place());
|
||||
// int16_t* out_data = static_cast<int16_t*>(output.data());
|
||||
// const int16_t* x_data = static_cast<const int16_t*>(x.data());
|
||||
// const int8_t* weight_data = weight.data<int8_t>();
|
||||
// const int16_t* weight_scale_data =
|
||||
// static_cast<const int16_t*>(weight_scale.data());
|
||||
|
||||
cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle();
|
||||
cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
|
||||
cuinferOperation_t transa = CUINFER_OP_T;
|
||||
@@ -81,16 +74,20 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
||||
cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
|
||||
|
||||
cuinferQuantGEMMHostParam cust_host_param;
|
||||
cuinferCustomGemmHostParamInit(&cust_host_param);
|
||||
cust_host_param.size = sizeof(cuinferQuantGEMMHostParam);
|
||||
cust_host_param.persistent = 0;
|
||||
cust_host_param.groupSize = group_size;
|
||||
cust_host_param.strideScaleA = n;
|
||||
// cust_host_param.strideScaleA = n;
|
||||
cust_host_param.expertCount = n_experts;
|
||||
cust_host_param.type = 2;
|
||||
|
||||
cuinferQuantGEMMDeviceParam cust_device_param;
|
||||
cust_device_param.size = sizeof(cuinferQuantGEMMDeviceParam);
|
||||
cust_device_param.sortedId = nullptr;
|
||||
cust_device_param.bias = nullptr;
|
||||
cust_device_param.scale = reinterpret_cast<const void*>(weight_scale.data());
|
||||
cust_device_param.nSize = reinterpret_cast<const void*>(prefix_sum.data());
|
||||
cust_device_param.scale = weight_scale.data();
|
||||
cust_device_param.nSize = prefix_sum.data<int32_t>();
|
||||
|
||||
int lda = k;
|
||||
int ldb = k;
|
||||
@@ -123,36 +120,35 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
||||
cust_device_param.workspace = nullptr;
|
||||
}
|
||||
|
||||
CUINFER_CHECK(
|
||||
cuinferCustomGemmEx(handle,
|
||||
stream,
|
||||
cuinfer_ptr_mode,
|
||||
transa,
|
||||
transb,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
&alpha,
|
||||
reinterpret_cast<const void*>(weight.data()),
|
||||
Atype,
|
||||
lda,
|
||||
0, // lda,
|
||||
reinterpret_cast<const void*>(x.data()),
|
||||
Btype,
|
||||
ldb,
|
||||
0, // ldb,
|
||||
&beta,
|
||||
reinterpret_cast<void*>(output.data()),
|
||||
Ctype,
|
||||
ldc,
|
||||
0, // ldc,
|
||||
batch_count,
|
||||
computeType,
|
||||
scaleType,
|
||||
&cust_host_param,
|
||||
&cust_device_param,
|
||||
customOption,
|
||||
cust_device_param.workspace));
|
||||
CUINFER_CHECK(cuinferCustomGemmEx(handle,
|
||||
stream,
|
||||
cuinfer_ptr_mode,
|
||||
transa,
|
||||
transb,
|
||||
n,
|
||||
m,
|
||||
k,
|
||||
&alpha,
|
||||
weight.data(),
|
||||
Atype,
|
||||
lda,
|
||||
0,
|
||||
x.data(),
|
||||
Btype,
|
||||
ldb,
|
||||
0,
|
||||
&beta,
|
||||
output.data(),
|
||||
Ctype,
|
||||
ldc,
|
||||
0,
|
||||
batch_count,
|
||||
computeType,
|
||||
scaleType,
|
||||
&cust_host_param,
|
||||
&cust_device_param,
|
||||
customOption,
|
||||
cust_device_param.workspace));
|
||||
return {output};
|
||||
}
|
||||
|
||||
|
||||
@@ -579,6 +579,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
|
||||
"iluvatar_ops/mixed_fused_attn.cu",
|
||||
"iluvatar_ops/w8a16_group_gemm.cu",
|
||||
"iluvatar_ops/w8a16_group_gemv.cu",
|
||||
"iluvatar_ops/restore_tokens_per_expert.cu",
|
||||
"iluvatar_ops/runtime/iluvatar_context.cc",
|
||||
"iluvatar_ops/cpp_extensions.cc",
|
||||
],
|
||||
|
||||
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
|
||||
Pull the Docker image
|
||||
|
||||
```bash
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||
```
|
||||
|
||||
## 3. Container Preparation
|
||||
### 3.1 Start Container
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -79,7 +79,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||
|
||||
# load the model
|
||||
graph_optimization_config = {"use_cudagraph": False}
|
||||
graph_optimization_config = {"use_cudagraph": True}
|
||||
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
||||
|
||||
# Perform batch inference
|
||||
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}'
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
If you want to use v0 loader, please set `--load-choices "default"`.
|
||||
|
||||
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
It takes about 52 minutes to run the GSM8K dataset.
|
||||
It takes about 26 minutes to run the GSM8K dataset.
|
||||
|
||||
```
|
||||
Accuracy: 0.914
|
||||
Invaild: 0.000
|
||||
Latency: 3143.301 s
|
||||
Latency: 1539.625 s
|
||||
```
|
||||
|
||||
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
||||
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--tool-call-parser ernie_x1 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}'
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
|
||||
client:
|
||||
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}'
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
If you want to use v0 loader, please set `--load-choices "default"`.
|
||||
|
||||
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
It takes about 4.8 hours to run the GSM8K dataset.
|
||||
|
||||
```
|
||||
Accuracy: 0.962
|
||||
Invaild: 0.000
|
||||
Latency: 17332.728 s
|
||||
```
|
||||
The accuracy of the GSM8K dataset is about `0.962`.
|
||||
|
||||
### 4.2 ERNIE-4.5-VL series
|
||||
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
||||
|
||||
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
|
||||
Pull the Docker image
|
||||
|
||||
```bash
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||
```
|
||||
|
||||
## 3. 准备容器
|
||||
### 3.1 启动容器
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -79,7 +79,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||
|
||||
# load the model
|
||||
graph_optimization_config = {"use_cudagraph": False}
|
||||
graph_optimization_config = {"use_cudagraph": True}
|
||||
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
||||
|
||||
# Perform batch inference
|
||||
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
||||
|
||||
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
推理整个GSM8K数据集大概需要52分钟。
|
||||
推理整个GSM8K数据集大概需要26分钟。
|
||||
|
||||
```
|
||||
Accuracy: 0.914
|
||||
Invaild: 0.000
|
||||
Latency: 3143.301 s
|
||||
Latency: 1539.625 s
|
||||
```
|
||||
|
||||
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
||||
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--tool-call-parser ernie_x1 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
|
||||
客户端:
|
||||
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}
|
||||
--graph-optimization-config '{"use_cudagraph": true}'
|
||||
```
|
||||
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
||||
|
||||
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
||||
```bash
|
||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||
```
|
||||
推理整个GSM8K数据集大概需要4.8个小时。
|
||||
|
||||
```
|
||||
Accuracy: 0.962
|
||||
Invaild: 0.000
|
||||
Latency: 17332.728 s
|
||||
```
|
||||
推理整个GSM8K数据集的精度大概是`0.962`。
|
||||
|
||||
### 4.2 ERNIE-4.5-VL系列
|
||||
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
||||
|
||||
@@ -22,12 +22,14 @@ from paddle.nn.quant import weight_only_linear
|
||||
|
||||
try:
|
||||
from fastdeploy.model_executor.ops.iluvatar import (
|
||||
restore_tokens_per_expert,
|
||||
w8a16_group_gemm,
|
||||
w8a16_group_gemv,
|
||||
)
|
||||
except ImportError:
|
||||
w8a16_group_gemm = None
|
||||
w8a16_group_gemv = None
|
||||
restore_tokens_per_expert = None
|
||||
|
||||
|
||||
def group_gemm(
|
||||
@@ -80,13 +82,14 @@ def group_gemm(
|
||||
)
|
||||
|
||||
|
||||
def _select_group_gemm_algo(moe_phase: str):
|
||||
# if moe_phase == "decode":
|
||||
if False:
|
||||
def _pre_process_expert_ffn(moe_phase: str, tokens_expert_prefix_sum: paddle.Tensor):
    """Select the grouped-GEMM kernel and token-count layout for the phase.

    In the decode phase the gemv kernel is used, which expects the raw
    per-expert token counts as int32, so the inclusive prefix sum is
    converted back via restore_tokens_per_expert. In other phases
    (prefill) the group-gemm kernel consumes the prefix sum directly.

    Args:
        moe_phase: "decode" or any other phase string (treated as prefill).
        tokens_expert_prefix_sum: inclusive prefix sum of tokens per expert.

    Returns:
        Tuple of (group_gemm_func, tokens_per_expert) where
        tokens_per_expert is in the layout the chosen kernel expects.
    """
    if moe_phase == "decode":
        group_gemm_func = w8a16_group_gemv
        # gemv path needs raw counts (int32), not the prefix sum.
        tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32")
    else:
        group_gemm_func = w8a16_group_gemm
        tokens_per_expert = tokens_expert_prefix_sum
    return group_gemm_func, tokens_per_expert
|
||||
|
||||
|
||||
def iluvatar_moe_expert_ffn(
|
||||
@@ -110,8 +113,8 @@ def iluvatar_moe_expert_ffn(
|
||||
assert expert_idx_per_token is None
|
||||
assert quant_method in ("weight_only_int8")
|
||||
assert not used_in_ep_low_latency
|
||||
group_gemm_func = _select_group_gemm_algo(moe_phase)
|
||||
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_expert_prefix_sum, -1)
|
||||
group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
|
||||
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
|
||||
act_out = swiglu(ffn1_output)
|
||||
output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_expert_prefix_sum, -1)
|
||||
output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_per_expert, -1)
|
||||
return output
|
||||
|
||||
@@ -198,7 +198,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": false}' > server.log 2>&1 &
|
||||
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
|
||||
|
||||
check_server_status
|
||||
|
||||
|
||||
Reference in New Issue
Block a user