mirror of https://github.com/PaddlePaddle/FastDeploy.git
Fix dummy run when using PD Disaggregation with EP inference. (#5112)
Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com>
@@ -1926,7 +1926,12 @@ class GPUModelRunner(ModelRunnerBase):
             else:
                 assert batch_size % 2 == 0
                 self._dummy_run(
-                    num_tokens=self.scheduler_config.max_num_batched_tokens,
+                    num_tokens=(
+                        self.scheduler_config.max_num_seqs
+                        * (self.speculative_config.num_speculative_tokens + 1)
+                        if self.scheduler_config.splitwise_role == "decode"
+                        else self.scheduler_config.max_num_batched_tokens
+                    ),
                     batch_size=int(batch_size / 2),
                     in_capturing=True,
                     expected_decode_len=1,
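Every hunk in this commit makes the same substitution, so the pattern is worth spelling out once. When the runner is the decode side of a PD (prefill/decode) disaggregated deployment, a dummy run only needs `max_num_seqs` tokens (times the draft length plus one when speculative decoding is enabled, as in the hunk above) rather than the much larger prefill budget `max_num_batched_tokens` — presumably so capture shapes match what a decode-only instance will actually see at serving time. A minimal sketch of that selection, using stand-in config classes and a hypothetical `dummy_run_num_tokens` helper that are not part of FastDeploy:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class SchedulerConfig:
    splitwise_role: str          # "prefill" or "decode" in a PD-disaggregated deployment
    max_num_seqs: int            # max concurrent sequences per decode step
    max_num_batched_tokens: int  # token budget of one prefill batch

@dataclass
class SpeculativeConfig:
    num_speculative_tokens: int  # draft tokens carried alongside each step's token

def dummy_run_num_tokens(sched: SchedulerConfig, spec: Optional[SpeculativeConfig]) -> int:
    """Token count for a capture-time dummy run (sketch of the patched expression)."""
    if sched.splitwise_role == "decode":
        # A decode instance processes one token per sequence per step; with
        # speculative decoding each step also carries the draft tokens.
        per_seq = spec.num_speculative_tokens + 1 if spec else 1
        return sched.max_num_seqs * per_seq
    # Prefill (or mixed) instances still need the full batched-token budget.
    return sched.max_num_batched_tokens

# e.g. decode role, 256 sequences, 3 draft tokens -> 256 * (3 + 1) = 1024 tokens
print(dummy_run_num_tokens(SchedulerConfig("decode", 256, 8192), SpeculativeConfig(3)))
```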
@@ -1943,7 +1948,11 @@ class GPUModelRunner(ModelRunnerBase):
             else:
                 assert batch_size % 2 == 0
                 self._dummy_run(
-                    num_tokens=self.scheduler_config.max_num_batched_tokens,
+                    num_tokens=(
+                        self.scheduler_config.max_num_seqs
+                        if self.scheduler_config.splitwise_role == "decode"
+                        else self.scheduler_config.max_num_batched_tokens
+                    ),
                     batch_size=int(batch_size / 2),
                     in_capturing=True,
                     expected_decode_len=3,
@@ -1955,7 +1964,11 @@ class GPUModelRunner(ModelRunnerBase):
         # Capture Draft Model with bsz 1
         if 1 in capture_sizes:
             self._dummy_run(
-                num_tokens=self.scheduler_config.max_num_batched_tokens,
+                num_tokens=(
+                    self.scheduler_config.max_num_seqs
+                    if self.scheduler_config.splitwise_role == "decode"
+                    else self.scheduler_config.max_num_batched_tokens
+                ),
                 batch_size=int(1),
                 in_capturing=True,
                 expected_decode_len=3,
@@ -1968,7 +1981,11 @@ class GPUModelRunner(ModelRunnerBase):
         else:
             for batch_size in sorted(capture_sizes, reverse=True):
                 self._dummy_run(
-                    num_tokens=self.scheduler_config.max_num_batched_tokens,
+                    num_tokens=(
+                        self.scheduler_config.max_num_seqs
+                        if self.scheduler_config.splitwise_role == "decode"
+                        else self.scheduler_config.max_num_batched_tokens
+                    ),
                     batch_size=batch_size,
                     in_capturing=True,
                     expected_decode_len=expected_decode_len,
@@ -2001,7 +2018,11 @@ class GPUModelRunner(ModelRunnerBase):
         start_time = time.perf_counter()
         for batch_size in self.sot_warmup_sizes:
             self._dummy_run(
-                num_tokens=self.scheduler_config.max_num_batched_tokens,
+                num_tokens=(
+                    self.scheduler_config.max_num_seqs
+                    if self.scheduler_config.splitwise_role == "decode"
+                    else self.scheduler_config.max_num_batched_tokens
+                ),
                 batch_size=batch_size,
             )
         logger.info(f"SOT warmup the model with the batch size:{batch_size}")
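This final hunk applies the same budget to the SOT warmup loop. A self-contained sketch of such a loop with the decode/prefill selection inlined — the `sot_warmup` function and `runner` argument are illustrative, not FastDeploy's API:

```python
import time

def sot_warmup(runner, warmup_sizes):
    """Run one dummy batch per warmup size before serving (illustrative sketch)."""
    sched = runner.scheduler_config
    num_tokens = (
        sched.max_num_seqs                  # decode side: one token per sequence
        if sched.splitwise_role == "decode"
        else sched.max_num_batched_tokens   # prefill side: full batch budget
    )
    start = time.perf_counter()
    for batch_size in warmup_sizes:
        runner._dummy_run(num_tokens=num_tokens, batch_size=batch_size)
    print(f"SOT warmup finished in {time.perf_counter() - start:.2f}s")
```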