From 0ddb6e461c4c45be0395466aa080cee57a6558ff Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Mon, 13 Apr 2026 22:07:41 +0800 Subject: [PATCH] =?UTF-8?q?[Optimization]=20=E7=A7=BB=E9=99=A4=20num=5Fblo?= =?UTF-8?q?cks=20=E4=B8=8A=E9=99=90=E9=99=90=E5=88=B6=20(#7241)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + fastdeploy/worker/iluvatar_worker.py | 5 ----- fastdeploy/worker/worker_process.py | 5 ----- tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py | 2 +- tests/e2e/test_EB_VL_Lite_serving.py | 2 +- 5 files changed, 3 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 2b35f3a83b..32c8f1a6cf 100644 --- a/.gitignore +++ b/.gitignore @@ -173,6 +173,7 @@ custom_ops/tmp* build .ccls-cache +.claude third_party diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index 44be900bb7..eed2169644 100644 --- a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -126,11 +126,6 @@ class IluvatarPaddleDisWorkerProc(PaddleDisWorkerProc): # 2. Calculate the appropriate number of blocks model_block_memory_used = self.worker.cal_theortical_kvcache() num_blocks_local = int(available_kv_cache_memory // model_block_memory_used) - # NOTE(liuzichang): Too many block will lead to illegal memory access - # We will develop dynamic limits in future. - if num_blocks_local > 40000: - logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000") - num_blocks_local = min(40000, num_blocks_local) logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------") logger.info(f"------- num_blocks_local:{num_blocks_local} --------") diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 917aee09a0..2874ea1bf7 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -666,11 +666,6 @@ class PaddleDisWorkerProc: # 2. Calculate the appropriate number of blocks model_block_memory_used = self.worker.cal_theortical_kvcache() num_blocks_local = int(available_kv_cache_memory // model_block_memory_used) - # NOTE(liuzichang): Too many block will lead to illegal memory access - # We will develop dynamic limits in future. - if num_blocks_local > 40000: - logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000") - num_blocks_local = min(40000, num_blocks_local) logger.info(f"------- model_block_memory_used:{model_block_memory_used / 1024**3} GB --------") logger.info(f"------- num_blocks_local:{num_blocks_local} --------") diff --git a/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py b/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py index b002ecf7ce..1b9ff70afc 100644 --- a/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py +++ b/tests/e2e/4cards_cases/test_Qwen3_30b_tp4.py @@ -281,7 +281,7 @@ def test_non_thinking_prompt(api_url, headers): def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" log_file = "./log/config.log" - baseline = 40000 + baseline = 74000 if not os.path.exists(log_file): pytest.fail(f"Log file not found: {log_file}") diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py index 778e192b40..83290ad710 100644 --- a/tests/e2e/test_EB_VL_Lite_serving.py +++ b/tests/e2e/test_EB_VL_Lite_serving.py @@ -736,7 +736,7 @@ def test_profile_reset_block_num(): """测试profile reset_block_num功能,与baseline diff不能超过5%""" log_dir = os.getenv("FD_LOG_DIR", "log") log_file = os.path.join(log_dir, "config.log") - baseline = 40000 + baseline = 65400 if not os.path.exists(log_file): pytest.fail(f"Log file not found: {log_file}")