[RL][CI] Support Async R3 And Add Accuracy Test (#5937)

* add bs1 r3 test case

* async put

* r3 test case 1.0

* success run eb5

* refine test case

* pre-commit

* add eb45 & glm testcase

* format code

* add p2pstore requirements

* support only last turn

* R3 use worker log

* refine code & fix CI bug

* refine error message

* fix empty input bug

* Success set acc ci of eb45 and glm45

* refine code

* fix bug
This commit is contained in:
RAM
2026-01-14 20:25:06 +08:00
committed by GitHub
parent 9373f373dc
commit b3f59fd9b5
9 changed files with 443 additions and 20 deletions
+4 -2
View File
@@ -644,14 +644,16 @@ class FusedMoE(nn.Layer):
"""
topk_ids_hookfunc = None
if self.enable_routing_replay:
if forward_meta is not None: # forward_meta is None when execute empty_input_forward
# When execute empty_input_forward forward_meta is None. When execute mtp layer routing_replay_table is None.
if forward_meta is not None and forward_meta.routing_replay_table is not None:
moe_layer_idx = self.layer_idx - self.fd_config.model_config.moe_layer_start_index
topk_ids_hookfunc = partial(
save_routing_to_buffer,
routing_replay_table=forward_meta.routing_replay_table,
batch_id_per_token=forward_meta.batch_id_per_token,
seq_lens_decoder=forward_meta.seq_lens_decoder,
cu_seqlens_q=forward_meta.cu_seqlens_q,
layer_idx=self.layer_idx,
layer_idx=moe_layer_idx,
tp_size=self.fd_config.parallel_config.tensor_parallel_size,
ep_size=self.fd_config.parallel_config.expert_parallel_size,
tp_group=self.fd_config.parallel_config.tp_group,