[PD Disaggregation] Support PD deployment of DeepSeekv3. (#5251)

* Support deepseekv3 cache transfer for PD deploy

* clean some log info

---------

Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com”>
This commit is contained in:
K11OntheBoat
2025-12-02 14:11:50 +08:00
committed by GitHub
parent 117980dd4e
commit 2e1680838f
17 changed files with 620 additions and 400 deletions
+2 -2
View File
@@ -258,10 +258,10 @@ class FusedMoE(nn.Layer):
else:
SHARD_ID_TO_SHARDED_DIM = {"gate": 0, "down": 1, "up": 0}
if not param._is_initialized():
param.initialize()
if not (expert_id - self.expert_id_offset >= 0 and expert_id - self.expert_id_offset < self.num_local_experts):
return
if not param._is_initialized():
param.initialize()
weight_need_transpose = getattr(param, "weight_need_transpose", False)
if shard_id is None:
# 1.gate up fused in disk