mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix] fix num_cpu_blocks computation (#6438)
* [BugFix] fix num_cpu_blocks computation * [fix] fix syntax and log * [fix] pre-commit * [fix] use getattr * [fix] ci test
This commit is contained in:
@@ -1533,12 +1533,6 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}")
|
||||
cache_kvs_list = []
|
||||
|
||||
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
|
||||
# To rationalize the allocation of kvcache.
|
||||
from fastdeploy import envs
|
||||
|
||||
self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
|
||||
for i in range(self.model_config.num_hidden_layers):
|
||||
# init key cache
|
||||
key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
|
||||
@@ -2748,7 +2742,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
|
||||
# To rationalize the allocation of kvcache.
|
||||
if self.mla_cache:
|
||||
if self.fd_config.cache_config.use_mla_cache:
|
||||
required_memory = (
|
||||
byte_of_dtype
|
||||
* (self.fd_config.model_config.kv_lora_rank + self.fd_config.model_config.qk_rope_head_dim)
|
||||
|
||||
Reference in New Issue
Block a user