mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Iluvatar] Optimize decode group_gemm and Support cuda graph for ernie (#6803)
This commit is contained in:
@@ -7,7 +7,7 @@ on:
|
|||||||
description: "Build Images"
|
description: "Build Images"
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0"
|
default: "ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312"
|
||||||
FASTDEPLOY_ARCHIVE_URL:
|
FASTDEPLOY_ARCHIVE_URL:
|
||||||
description: "URL of the compressed FastDeploy code archive."
|
description: "URL of the compressed FastDeploy code archive."
|
||||||
required: true
|
required: true
|
||||||
|
|||||||
@@ -19,5 +19,5 @@ jobs:
|
|||||||
needs: [clone]
|
needs: [clone]
|
||||||
uses: ./.github/workflows/_iluvatar_cases.yml
|
uses: ./.github/workflows/_iluvatar_cases.yml
|
||||||
with:
|
with:
|
||||||
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||||
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
|
||||||
|
|||||||
@@ -0,0 +1,71 @@
|
|||||||
|
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
#include "helper.h"
|
||||||
|
#include "iluvatar_context.h"
|
||||||
|
|
||||||
|
void __global__ restore_from_prefix_sum_kernel(const int64_t* prefix_sum,
|
||||||
|
int64_t* tokens_per_expert,
|
||||||
|
const int num_experts) {
|
||||||
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (idx < num_experts) {
|
||||||
|
if (idx == 0) {
|
||||||
|
tokens_per_expert[idx] = prefix_sum[idx];
|
||||||
|
} else {
|
||||||
|
tokens_per_expert[idx] = prefix_sum[idx] - prefix_sum[idx - 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::Tensor> RestoreTokensPerExpert(
|
||||||
|
const paddle::Tensor& tokens_expert_prefix_sum) {
|
||||||
|
const auto& prefix_sum_dims = tokens_expert_prefix_sum.dims();
|
||||||
|
PADDLE_ENFORCE_EQ(prefix_sum_dims.size(),
|
||||||
|
1,
|
||||||
|
common::errors::InvalidArgument(
|
||||||
|
"tokens_expert_prefix_sum dims is [num_experts]"));
|
||||||
|
|
||||||
|
const int num_experts = prefix_sum_dims[0];
|
||||||
|
auto stream = tokens_expert_prefix_sum.stream();
|
||||||
|
auto tokens_per_expert = GetEmptyTensor({num_experts},
|
||||||
|
tokens_expert_prefix_sum.dtype(),
|
||||||
|
tokens_expert_prefix_sum.place());
|
||||||
|
|
||||||
|
const int block_size = 128;
|
||||||
|
const int grid_size = (num_experts + block_size - 1) / block_size;
|
||||||
|
restore_from_prefix_sum_kernel<<<grid_size, block_size, 0, stream>>>(
|
||||||
|
const_cast<int64_t*>(tokens_expert_prefix_sum.data<int64_t>()),
|
||||||
|
tokens_per_expert.data<int64_t>(),
|
||||||
|
num_experts);
|
||||||
|
|
||||||
|
return {tokens_per_expert};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<int64_t>> RestoreTokensPerExpertInferShape(
|
||||||
|
const std::vector<int64_t>& tokens_expert_prefix_sum_shape) {
|
||||||
|
return {tokens_expert_prefix_sum_shape};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<paddle::DataType> RestoreTokensPerExpertInferDtype(
|
||||||
|
const paddle::DataType& tokens_expert_prefix_sum_dtype) {
|
||||||
|
return {tokens_expert_prefix_sum_dtype};
|
||||||
|
}
|
||||||
|
|
||||||
|
PD_BUILD_STATIC_OP(restore_tokens_per_expert)
|
||||||
|
.Inputs({"tokens_expert_prefix_sum"})
|
||||||
|
.Outputs({"tokens_per_expert"})
|
||||||
|
.SetKernelFn(PD_KERNEL(RestoreTokensPerExpert))
|
||||||
|
.SetInferShapeFn(PD_INFER_SHAPE(RestoreTokensPerExpertInferShape))
|
||||||
|
.SetInferDtypeFn(PD_INFER_DTYPE(RestoreTokensPerExpertInferDtype));
|
||||||
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
|||||||
PD_CHECK(ws_dims[1] == n);
|
PD_CHECK(ws_dims[1] == n);
|
||||||
PD_CHECK(prefix_sum_dims[0] == n_experts);
|
PD_CHECK(prefix_sum_dims[0] == n_experts);
|
||||||
|
|
||||||
PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT64);
|
PD_CHECK(prefix_sum.dtype() == paddle::DataType::INT32);
|
||||||
PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 ||
|
PD_CHECK(x.dtype() == paddle::DataType::BFLOAT16 ||
|
||||||
x.dtype() == paddle::DataType::FLOAT16);
|
x.dtype() == paddle::DataType::FLOAT16);
|
||||||
PD_CHECK(weight.dtype() == paddle::DataType::INT8);
|
PD_CHECK(weight.dtype() == paddle::DataType::INT8);
|
||||||
@@ -54,14 +54,7 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
|||||||
PD_CHECK(weight.is_contiguous());
|
PD_CHECK(weight.is_contiguous());
|
||||||
PD_CHECK(weight_scale.is_contiguous());
|
PD_CHECK(weight_scale.is_contiguous());
|
||||||
|
|
||||||
// const int64_t* prefix_sum_ptr = prefix_sum.data<int64_t>();
|
|
||||||
auto output = GetEmptyTensor({m, n}, x.dtype(), x.place());
|
auto output = GetEmptyTensor({m, n}, x.dtype(), x.place());
|
||||||
// int16_t* out_data = static_cast<int16_t*>(output.data());
|
|
||||||
// const int16_t* x_data = static_cast<const int16_t*>(x.data());
|
|
||||||
// const int8_t* weight_data = weight.data<int8_t>();
|
|
||||||
// const int16_t* weight_scale_data =
|
|
||||||
// static_cast<const int16_t*>(weight_scale.data());
|
|
||||||
|
|
||||||
cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle();
|
cuinferHandle_t handle = iluvatar::getContextInstance()->getIxInferHandle();
|
||||||
cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
|
cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
|
||||||
cuinferOperation_t transa = CUINFER_OP_T;
|
cuinferOperation_t transa = CUINFER_OP_T;
|
||||||
@@ -81,16 +74,20 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
|||||||
cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
|
cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
|
||||||
|
|
||||||
cuinferQuantGEMMHostParam cust_host_param;
|
cuinferQuantGEMMHostParam cust_host_param;
|
||||||
|
cuinferCustomGemmHostParamInit(&cust_host_param);
|
||||||
cust_host_param.size = sizeof(cuinferQuantGEMMHostParam);
|
cust_host_param.size = sizeof(cuinferQuantGEMMHostParam);
|
||||||
cust_host_param.persistent = 0;
|
cust_host_param.persistent = 0;
|
||||||
cust_host_param.groupSize = group_size;
|
cust_host_param.groupSize = group_size;
|
||||||
cust_host_param.strideScaleA = n;
|
// cust_host_param.strideScaleA = n;
|
||||||
cust_host_param.expertCount = n_experts;
|
cust_host_param.expertCount = n_experts;
|
||||||
|
cust_host_param.type = 2;
|
||||||
|
|
||||||
cuinferQuantGEMMDeviceParam cust_device_param;
|
cuinferQuantGEMMDeviceParam cust_device_param;
|
||||||
|
cust_device_param.size = sizeof(cuinferQuantGEMMDeviceParam);
|
||||||
|
cust_device_param.sortedId = nullptr;
|
||||||
cust_device_param.bias = nullptr;
|
cust_device_param.bias = nullptr;
|
||||||
cust_device_param.scale = reinterpret_cast<const void*>(weight_scale.data());
|
cust_device_param.scale = weight_scale.data();
|
||||||
cust_device_param.nSize = reinterpret_cast<const void*>(prefix_sum.data());
|
cust_device_param.nSize = prefix_sum.data<int32_t>();
|
||||||
|
|
||||||
int lda = k;
|
int lda = k;
|
||||||
int ldb = k;
|
int ldb = k;
|
||||||
@@ -123,36 +120,35 @@ std::vector<paddle::Tensor> GroupGemv(const paddle::Tensor& x,
|
|||||||
cust_device_param.workspace = nullptr;
|
cust_device_param.workspace = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
CUINFER_CHECK(
|
CUINFER_CHECK(cuinferCustomGemmEx(handle,
|
||||||
cuinferCustomGemmEx(handle,
|
stream,
|
||||||
stream,
|
cuinfer_ptr_mode,
|
||||||
cuinfer_ptr_mode,
|
transa,
|
||||||
transa,
|
transb,
|
||||||
transb,
|
n,
|
||||||
n,
|
m,
|
||||||
m,
|
k,
|
||||||
k,
|
&alpha,
|
||||||
&alpha,
|
weight.data(),
|
||||||
reinterpret_cast<const void*>(weight.data()),
|
Atype,
|
||||||
Atype,
|
lda,
|
||||||
lda,
|
0,
|
||||||
0, // lda,
|
x.data(),
|
||||||
reinterpret_cast<const void*>(x.data()),
|
Btype,
|
||||||
Btype,
|
ldb,
|
||||||
ldb,
|
0,
|
||||||
0, // ldb,
|
&beta,
|
||||||
&beta,
|
output.data(),
|
||||||
reinterpret_cast<void*>(output.data()),
|
Ctype,
|
||||||
Ctype,
|
ldc,
|
||||||
ldc,
|
0,
|
||||||
0, // ldc,
|
batch_count,
|
||||||
batch_count,
|
computeType,
|
||||||
computeType,
|
scaleType,
|
||||||
scaleType,
|
&cust_host_param,
|
||||||
&cust_host_param,
|
&cust_device_param,
|
||||||
&cust_device_param,
|
customOption,
|
||||||
customOption,
|
cust_device_param.workspace));
|
||||||
cust_device_param.workspace));
|
|
||||||
return {output};
|
return {output};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -579,6 +579,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
|
|||||||
"iluvatar_ops/mixed_fused_attn.cu",
|
"iluvatar_ops/mixed_fused_attn.cu",
|
||||||
"iluvatar_ops/w8a16_group_gemm.cu",
|
"iluvatar_ops/w8a16_group_gemm.cu",
|
||||||
"iluvatar_ops/w8a16_group_gemv.cu",
|
"iluvatar_ops/w8a16_group_gemv.cu",
|
||||||
|
"iluvatar_ops/restore_tokens_per_expert.cu",
|
||||||
"iluvatar_ops/runtime/iluvatar_context.cc",
|
"iluvatar_ops/runtime/iluvatar_context.cc",
|
||||||
"iluvatar_ops/cpp_extensions.cc",
|
"iluvatar_ops/cpp_extensions.cc",
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
|
|||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||||
```
|
```
|
||||||
|
|
||||||
## 3. Container Preparation
|
## 3. Container Preparation
|
||||||
### 3.1 Start Container
|
### 3.1 Start Container
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||||
docker exec -it paddle_infer bash
|
docker exec -it paddle_infer bash
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -79,7 +79,7 @@ prompts = [
|
|||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||||
|
|
||||||
# load the model
|
# load the model
|
||||||
graph_optimization_config = {"use_cudagraph": False}
|
graph_optimization_config = {"use_cudagraph": True}
|
||||||
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
||||||
|
|
||||||
# Perform batch inference
|
# Perform batch inference
|
||||||
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}'
|
--graph-optimization-config '{"use_cudagraph": true}'
|
||||||
```
|
```
|
||||||
If you want to use v0 loader, please set `--load-choices "default"`.
|
If you want to use v0 loader, please set `--load-choices "default"`.
|
||||||
|
|
||||||
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
|||||||
```bash
|
```bash
|
||||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||||
```
|
```
|
||||||
It takes about 52 minutes to run the GSM8K dataset.
|
It takes about 26 minutes to run the GSM8K dataset.
|
||||||
|
|
||||||
```
|
```
|
||||||
Accuracy: 0.914
|
Accuracy: 0.914
|
||||||
Invaild: 0.000
|
Invaild: 0.000
|
||||||
Latency: 3143.301 s
|
Latency: 1539.625 s
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
||||||
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--tool-call-parser ernie_x1 \
|
--tool-call-parser ernie_x1 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}'
|
--graph-optimization-config '{"use_cudagraph": true}'
|
||||||
```
|
```
|
||||||
|
|
||||||
client:
|
client:
|
||||||
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}'
|
--graph-optimization-config '{"use_cudagraph": true}'
|
||||||
```
|
```
|
||||||
If you want to use v0 loader, please set `--load-choices "default"`.
|
If you want to use v0 loader, please set `--load-choices "default"`.
|
||||||
|
|
||||||
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
|||||||
```bash
|
```bash
|
||||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||||
```
|
```
|
||||||
It takes about 52 minutes to run the GSM8K dataset.
|
The accuracy of the GSM8K dataset is about `0.962`.
|
||||||
|
|
||||||
```
|
|
||||||
Accuracy: 0.962
|
|
||||||
Invaild: 0.000
|
|
||||||
Latency: 17332.728 s
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4.2 ERNIE-4.5-VL series
|
### 4.2 ERNIE-4.5-VL series
|
||||||
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
||||||
|
|||||||
@@ -16,14 +16,14 @@ modinfo iluvatar |grep description
|
|||||||
Pull the Docker image
|
Pull the Docker image
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||||
```
|
```
|
||||||
|
|
||||||
## 3. 准备容器
|
## 3. 准备容器
|
||||||
### 3.1 启动容器
|
### 3.1 启动容器
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0
|
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260312
|
||||||
docker exec -it paddle_infer bash
|
docker exec -it paddle_infer bash
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -79,7 +79,7 @@ prompts = [
|
|||||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
|
||||||
|
|
||||||
# load the model
|
# load the model
|
||||||
graph_optimization_config = {"use_cudagraph": False}
|
graph_optimization_config = {"use_cudagraph": True}
|
||||||
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
llm = LLM(model="/home/paddle/ERNIE-4.5-21B-A3B-Paddle", tensor_parallel_size=1, max_model_len=8192, block_size=16, quantization='wint8', graph_optimization_config=graph_optimization_config)
|
||||||
|
|
||||||
# Perform batch inference
|
# Perform batch inference
|
||||||
@@ -147,7 +147,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}
|
--graph-optimization-config '{"use_cudagraph": true}
|
||||||
```
|
```
|
||||||
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
||||||
|
|
||||||
@@ -177,12 +177,12 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
|||||||
```bash
|
```bash
|
||||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||||
```
|
```
|
||||||
推理整个GSM8K数据集大概需要52分钟。
|
推理整个GSM8K数据集大概需要26分钟。
|
||||||
|
|
||||||
```
|
```
|
||||||
Accuracy: 0.914
|
Accuracy: 0.914
|
||||||
Invaild: 0.000
|
Invaild: 0.000
|
||||||
Latency: 3143.301 s
|
Latency: 1539.625 s
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
#### 4.1.2 ERNIE-4.5-21B-A3B-Thinking
|
||||||
@@ -206,7 +206,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--tool-call-parser ernie_x1 \
|
--tool-call-parser ernie_x1 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}
|
--graph-optimization-config '{"use_cudagraph": true}
|
||||||
```
|
```
|
||||||
|
|
||||||
客户端:
|
客户端:
|
||||||
@@ -241,7 +241,7 @@ python3 -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}
|
--graph-optimization-config '{"use_cudagraph": true}
|
||||||
```
|
```
|
||||||
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
如果想切换到 v0 loader, 请设置 `--load-choices "default"`。
|
||||||
|
|
||||||
@@ -271,13 +271,7 @@ cp FastDeploy/tests/ci_use/iluvatar_UT/bench_gsm8k.py .
|
|||||||
```bash
|
```bash
|
||||||
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
python3 -u bench_gsm8k.py --port 8180 --num-questions 1319 --num-shots 5 --parallel 8
|
||||||
```
|
```
|
||||||
推理整个GSM8K数据集大概需要4.8个小时。
|
推理整个GSM8K数据集的精度大概是`0.962`。
|
||||||
|
|
||||||
```
|
|
||||||
Accuracy: 0.962
|
|
||||||
Invaild: 0.000
|
|
||||||
Latency: 17332.728 s
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4.2 ERNIE-4.5-VL系列
|
### 4.2 ERNIE-4.5-VL系列
|
||||||
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
#### 4.2.1 ERNIE-4.5-VL-28B-A3B-Paddle
|
||||||
|
|||||||
@@ -22,12 +22,14 @@ from paddle.nn.quant import weight_only_linear
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from fastdeploy.model_executor.ops.iluvatar import (
|
from fastdeploy.model_executor.ops.iluvatar import (
|
||||||
|
restore_tokens_per_expert,
|
||||||
w8a16_group_gemm,
|
w8a16_group_gemm,
|
||||||
w8a16_group_gemv,
|
w8a16_group_gemv,
|
||||||
)
|
)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
w8a16_group_gemm = None
|
w8a16_group_gemm = None
|
||||||
w8a16_group_gemv = None
|
w8a16_group_gemv = None
|
||||||
|
restore_tokens_per_expert = None
|
||||||
|
|
||||||
|
|
||||||
def group_gemm(
|
def group_gemm(
|
||||||
@@ -80,13 +82,14 @@ def group_gemm(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _select_group_gemm_algo(moe_phase: str):
|
def _pre_process_expert_ffn(moe_phase: str, tokens_expert_prefix_sum: paddle.Tensor):
|
||||||
# if moe_phase == "decode":
|
if moe_phase == "decode":
|
||||||
if False:
|
|
||||||
group_gemm_func = w8a16_group_gemv
|
group_gemm_func = w8a16_group_gemv
|
||||||
|
tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32")
|
||||||
else:
|
else:
|
||||||
group_gemm_func = w8a16_group_gemm
|
group_gemm_func = w8a16_group_gemm
|
||||||
return group_gemm_func
|
tokens_per_expert = tokens_expert_prefix_sum
|
||||||
|
return group_gemm_func, tokens_per_expert
|
||||||
|
|
||||||
|
|
||||||
def iluvatar_moe_expert_ffn(
|
def iluvatar_moe_expert_ffn(
|
||||||
@@ -110,8 +113,8 @@ def iluvatar_moe_expert_ffn(
|
|||||||
assert expert_idx_per_token is None
|
assert expert_idx_per_token is None
|
||||||
assert quant_method in ("weight_only_int8")
|
assert quant_method in ("weight_only_int8")
|
||||||
assert not used_in_ep_low_latency
|
assert not used_in_ep_low_latency
|
||||||
group_gemm_func = _select_group_gemm_algo(moe_phase)
|
group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
|
||||||
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_expert_prefix_sum, -1)
|
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
|
||||||
act_out = swiglu(ffn1_output)
|
act_out = swiglu(ffn1_output)
|
||||||
output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_expert_prefix_sum, -1)
|
output = group_gemm_func(act_out, down_proj_weight, down_proj_scale, tokens_per_expert, -1)
|
||||||
return output
|
return output
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 8 \
|
--max-num-seqs 8 \
|
||||||
--block-size 16 \
|
--block-size 16 \
|
||||||
--graph-optimization-config '{"use_cudagraph": false}' > server.log 2>&1 &
|
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
|
||||||
|
|
||||||
check_server_status
|
check_server_status
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user