From 209e5cf7f47955aaf128a0936adb76b8c6efdaa8 Mon Sep 17 00:00:00 2001 From: xiegegege <46314656+xiegegege@users.noreply.github.com> Date: Thu, 26 Mar 2026 20:01:05 +0800 Subject: [PATCH] [CE]add 21b mooncake yaml (#7033) * [CE]add 21b cpu cache ,glm mtp,glm for rl config * [CE]add 21b tp2 yaml * [CE]add 21b mooncake yaml * add fastdeploy benchmark,paddletest-155 * [CE] adjust vl wint4 config * [CE]add glm mtp with updatemodel config * [CE]fix * fix * test * test * test --------- Co-authored-by: xiegegege <> --- .../yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml | 10 ++++++++++ .../yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml | 5 +++++ benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml | 1 + 3 files changed, 16 insertions(+) create mode 100644 benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml create mode 100644 benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml diff --git a/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml b/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml new file mode 100644 index 0000000000..69e9fd1823 --- /dev/null +++ b/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml @@ -0,0 +1,10 @@ +max_model_len: 32768 +max_num_seqs: 128 +tensor_parallel_size: 4 +graph_optimization_config: + use_cudagraph: True + draft_model_use_cudagraph: True +load_choices: "default_v1" +dynamic_load_weight: True +load_strategy: ipc_snapshot +shutdown_comm_group_if_worker_idle: False diff --git a/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml b/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml new file mode 100644 index 0000000000..0021a04f56 --- /dev/null +++ b/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml @@ -0,0 +1,5 @@ +max_model_len: 131072 +max_num_seqs: 256 +tensor_parallel_size: 2 +kvcache_storage_backend: "mooncake" +enable_output_caching: True diff --git a/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml b/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml index 1a53f9b9a9..751bd70e07 100644 --- a/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml +++ b/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml @@ -7,3 +7,4 @@ tensor_parallel_size: 8 quantization: wint4 limit_mm_per_prompt: '{"image": 100, "video": 100}' reasoning_parser: ernie-45-vl +max_num_batched_tokens: 4096