From 209e5cf7f47955aaf128a0936adb76b8c6efdaa8 Mon Sep 17 00:00:00 2001
From: xiegegege <46314656+xiegegege@users.noreply.github.com>
Date: Thu, 26 Mar 2026 20:01:05 +0800
Subject: [PATCH] [CE]add 21b mooncake yaml (#7033)

* [CE] add 21b cpu cache, glm mtp, glm for rl config

* [CE]add 21b tp2 yaml

* [CE]add 21b mooncake yaml

* add fastdeploy benchmark, paddletest-155

* [CE] adjust vl wint4 config

* [CE]add glm mtp with updatemodel config

* [CE]fix

* fix

* test

* test

* test

---------

Co-authored-by: xiegegege <>
---
 .../yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml       | 10 ++++++++++
 .../yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml       |  5 +++++
 benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml        |  1 +
 3 files changed, 16 insertions(+)
 create mode 100644 benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml
 create mode 100644 benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml

diff --git a/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml b/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml
new file mode 100644
index 0000000000..69e9fd1823
--- /dev/null
+++ b/benchmarks/yaml/GLM45-air-32k-bf16-mtp-updatemodel.yaml
@@ -0,0 +1,10 @@
+max_model_len: 32768  # 32768 = 32 * 1024, matching "32k" in the file name
+max_num_seqs: 128
+tensor_parallel_size: 4
+graph_optimization_config:
+  use_cudagraph: True
+  draft_model_use_cudagraph: True  # NOTE(review): presumably the "mtp" draft model from the file name; confirm
+load_choices: "default_v1"
+dynamic_load_weight: True  # NOTE(review): presumably the "updatemodel" part of the file name; confirm
+load_strategy: ipc_snapshot
+shutdown_comm_group_if_worker_idle: False
diff --git a/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml b/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml
new file mode 100644
index 0000000000..0021a04f56
--- /dev/null
+++ b/benchmarks/yaml/eb45-21b-a3b-32k-bf16-tp2-mooncake.yaml
@@ -0,0 +1,5 @@
+max_model_len: 131072  # NOTE(review): 131072 = 128 * 1024, but the file name says "32k"; confirm intended
+max_num_seqs: 256
+tensor_parallel_size: 2  # matches "tp2" in the file name
+kvcache_storage_backend: "mooncake"  # matches "mooncake" in the file name
+enable_output_caching: True
diff --git a/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml b/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml
index 1a53f9b9a9..751bd70e07 100644
--- a/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml
+++ b/benchmarks/yaml/eb45-vl-32k-wint4-a800-tp8.yaml
@@ -7,3 +7,4 @@ tensor_parallel_size: 8
 quantization: wint4
 limit_mm_per_prompt: '{"image": 100, "video": 100}'
 reasoning_parser: ernie-45-vl
+max_num_batched_tokens: 4096