From 0ddb6e461c4c45be0395466aa080cee57a6558ff Mon Sep 17 00:00:00 2001
From: Yuanle Liu <yuanlehome@163.com>
Date: Mon, 13 Apr 2026 22:07:41 +0800
Subject: [PATCH] =?UTF-8?q?[Optimization]=20=E7=A7=BB=E9=99=A4=20num=5Fblo?=
 =?UTF-8?q?cks=20=E4=B8=8A=E9=99=90=E9=99=90=E5=88=B6=20(#7241)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                   | 1 +
 fastdeploy/worker/iluvatar_worker.py         | 5 -----
 fastdeploy/worker/worker_process.py          | 5 -----
 tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py | 2 +-
 tests/e2e/test_EB_VL_Lite_serving.py         | 2 +-
 5 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2b35f3a83b..32c8f1a6cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,6 +173,7 @@ custom_ops/tmp*
 build
 
 .ccls-cache
+.claude
 
 third_party
 
diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py
index 44be900bb7..eed2169644 100644
--- a/fastdeploy/worker/iluvatar_worker.py
+++ b/fastdeploy/worker/iluvatar_worker.py
@@ -126,11 +126,6 @@ class IluvatarPaddleDisWorkerProc(PaddleDisWorkerProc):
             # 2. Calculate the appropriate number of blocks
             model_block_memory_used = self.worker.cal_theortical_kvcache()
             num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
-            # NOTE(liuzichang): Too many block will lead to illegal memory access
-            # We will develop dynamic limits in future.
-            if num_blocks_local > 40000:
-                logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
-                num_blocks_local = min(40000, num_blocks_local)
             logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
             logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
 
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 917aee09a0..2874ea1bf7 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -666,11 +666,6 @@ class PaddleDisWorkerProc:
             # 2. Calculate the appropriate number of blocks
             model_block_memory_used = self.worker.cal_theortical_kvcache()
             num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
-            # NOTE(liuzichang): Too many block will lead to illegal memory access
-            # We will develop dynamic limits in future.
-            if num_blocks_local > 40000:
-                logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
-                num_blocks_local = min(40000, num_blocks_local)
             logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------")
             logger.info(f"------- num_blocks_local:{num_blocks_local} --------")
 
diff --git a/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py b/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py
index b002ecf7ce..1b9ff70afc 100644
--- a/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py
+++ b/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py
@@ -281,7 +281,7 @@ def test_non_thinking_prompt(api_url, headers):
 def test_profile_reset_block_num():
     """测试profile reset_block_num功能，与baseline diff不能超过5%"""
     log_file = "./log/config.log"
-    baseline = 40000
+    baseline = 74000
 
     if not os.path.exists(log_file):
         pytest.fail(f"Log file not found: {log_file}")
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index 778e192b40..83290ad710 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -736,7 +736,7 @@ def test_profile_reset_block_num():
     """测试profile reset_block_num功能，与baseline diff不能超过5%"""
     log_dir = os.getenv("FD_LOG_DIR", "log")
     log_file = os.path.join(log_dir, "config.log")
-    baseline = 40000
+    baseline = 65400
 
     if not os.path.exists(log_file):
         pytest.fail(f"Log file not found: {log_file}")