Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2026-04-23 00:17:25 +08:00)
[XPU] support warmup with ep & remove apply_tp_fused_op (#6289)
@@ -1364,6 +1364,12 @@ class XPUModelRunner(ModelRunnerBase):
         """
         max_dec_len = expected_decode_len + 1
         input_length = min(num_tokens // batch_size, self.model_config.max_model_len - max_dec_len)
+
+        # NOTE(wanglongzhi): When the input length is too large, DeepEP's buffer is not large enough, causing the result to contain NaN.
+        # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
+        if self.fd_config.parallel_config.enable_expert_parallel:
+            input_length = min(input_length, 32)
+
         block_num = (
             input_length + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
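For context, the block count above is a ceiling division of the warmup input length by the KV-cache block size, plus the reserved encoder/decoder blocks. A minimal sketch of that arithmetic follows; the helper name and the block_size and enc_dec_block_num values are hypothetical examples, not FastDeploy defaults.

# Sketch of the block_num arithmetic from the hunk above.
# block_size=64 and enc_dec_block_num=2 are illustrative assumptions.
def compute_block_num(input_length: int, block_size: int = 64, enc_dec_block_num: int = 2) -> int:
    # Ceiling division: cache blocks needed to hold input_length tokens.
    full_blocks = (input_length + block_size - 1) // block_size
    return full_blocks + enc_dec_block_num

# With expert parallelism enabled, input_length is capped at 32 tokens,
# so the warmup run needs only one block plus the reserved ones.
assert compute_block_num(32, block_size=64, enc_dec_block_num=2) == 3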
@@ -1710,7 +1716,11 @@ class XPUModelRunner(ModelRunnerBase):
         self.initialize_kv_cache(profile=True)

         self._dummy_run(
-            num_tokens=int(self.scheduler_config.max_num_batched_tokens),
+            num_tokens=(
+                self.scheduler_config.max_num_seqs
+                if self.scheduler_config.splitwise_role == "decode"
+                else self.scheduler_config.max_num_batched_tokens
+            ),
             batch_size=min(self.scheduler_config.max_num_seqs, 1),
         )

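The hunk above makes the profiling dummy run role-aware: a worker in the "decode" splitwise role warms up with at most max_num_seqs tokens (a decode step produces one token per sequence), while other roles keep the full max_num_batched_tokens budget. A hedged sketch of the same selection logic follows; the dataclass, its field values, and the non-"decode" role names are illustrative assumptions.

# Sketch of the role-aware warmup token count, mirroring the diff above.
from dataclasses import dataclass

@dataclass
class SchedulerConfig:
    splitwise_role: str          # e.g. "decode"; other role names are assumed
    max_num_seqs: int            # max concurrent sequences
    max_num_batched_tokens: int  # max tokens per scheduling step

def warmup_num_tokens(cfg: SchedulerConfig) -> int:
    # Decode emits one token per sequence, so max_num_seqs bounds the batch;
    # other roles warm up with the full batched-token budget.
    return cfg.max_num_seqs if cfg.splitwise_role == "decode" else cfg.max_num_batched_tokens

print(warmup_num_tokens(SchedulerConfig("decode", 64, 8192)))   # 64
print(warmup_num_tokens(SchedulerConfig("prefill", 64, 8192)))  # 8192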