[BugFix] fix num_cpu_blocks computation (#6438)

* [BugFix] fix num_cpu_blocks computation

* [fix] fix syntax and log

* [fix] pre-commit

* [fix] use getattr

* [fix] ci test
This commit is contained in:
Yonghua Li
2026-02-13 11:05:14 +08:00
committed by GitHub
parent 52edf5e9b3
commit e2332a1112
9 changed files with 162 additions and 63 deletions
+1 -7
View File
@@ -1533,12 +1533,6 @@ class GPUModelRunner(ModelRunnerBase):
logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}")
cache_kvs_list = []
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
# To rationalize the allocation of kvcache.
from fastdeploy import envs
self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
for i in range(self.model_config.num_hidden_layers):
# init key cache
key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
@@ -2748,7 +2742,7 @@ class GPUModelRunner(ModelRunnerBase):
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
# To rationalize the allocation of kvcache.
if self.mla_cache:
if self.fd_config.cache_config.use_mla_cache:
required_memory = (
byte_of_dtype
* (self.fd_config.model_config.kv_lora_rank + self.fd_config.model_config.qk_rope_head_dim)