Fix dummy run when using PD Disaggregation with EP inference. (#5112)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
This commit is contained in:
K11OntheBoat
2025-11-18 21:09:30 +08:00
committed by GitHub
parent 7fdc920a01
commit 4a7739ec0b
+26 -5
View File
@@ -1926,7 +1926,12 @@ class GPUModelRunner(ModelRunnerBase):
else:
assert batch_size % 2 == 0
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
num_tokens=(
self.scheduler_config.max_num_seqs
* (self.speculative_config.num_speculative_tokens + 1)
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=int(batch_size / 2),
in_capturing=True,
expected_decode_len=1,
@@ -1943,7 +1948,11 @@ class GPUModelRunner(ModelRunnerBase):
else:
assert batch_size % 2 == 0
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=int(batch_size / 2),
in_capturing=True,
expected_decode_len=3,
@@ -1955,7 +1964,11 @@ class GPUModelRunner(ModelRunnerBase):
# Capture Draft Model with bsz 1
if 1 in capture_sizes:
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=int(1),
in_capturing=True,
expected_decode_len=3,
@@ -1968,7 +1981,11 @@ class GPUModelRunner(ModelRunnerBase):
else:
for batch_size in sorted(capture_sizes, reverse=True):
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=batch_size,
in_capturing=True,
expected_decode_len=expected_decode_len,
@@ -2001,7 +2018,11 @@ class GPUModelRunner(ModelRunnerBase):
start_time = time.perf_counter()
for batch_size in self.sot_warmup_sizes:
self._dummy_run(
num_tokens=self.scheduler_config.max_num_batched_tokens,
num_tokens=(
self.scheduler_config.max_num_seqs
if self.scheduler_config.splitwise_role == "decode"
else self.scheduler_config.max_num_batched_tokens
),
batch_size=batch_size,
)
logger.info(f"SOT warmup the model with the batch size:{batch_size}")