mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix] fix num_cpu_blocks computation (#6438)
* [BugFix] fix num_cpu_blocks computation * [fix] fix syntax and log * [fix] pre-commit * [fix] use getattr * [fix] ci test
This commit is contained in:
@@ -1533,12 +1533,6 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}")
|
||||
cache_kvs_list = []
|
||||
|
||||
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
|
||||
# To rationalize the allocation of kvcache.
|
||||
from fastdeploy import envs
|
||||
|
||||
self.mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
|
||||
for i in range(self.model_config.num_hidden_layers):
|
||||
# init key cache
|
||||
key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
|
||||
@@ -2748,7 +2742,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
# NOTE:(changwenbin) Determine whether it is Multi-Head Latent Attention,
|
||||
# To rationalize the allocation of kvcache.
|
||||
if self.mla_cache:
|
||||
if self.fd_config.cache_config.use_mla_cache:
|
||||
required_memory = (
|
||||
byte_of_dtype
|
||||
* (self.fd_config.model_config.kv_lora_rank + self.fd_config.model_config.qk_rope_head_dim)
|
||||
|
||||
Reference in New Issue
Block a user