mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Speculative Decoding] Support mtp super ultra overlap in pd-split mode with insert_task overlap (#7323)
* support mtp overlap in pd-split mode with insert_task overlap
@@ -24,8 +24,23 @@ import numpy as np

import paddle

try:
    from cuda import cudart
except ImportError:
    import cuda as _cuda_pkg

    _cuda_ver = getattr(_cuda_pkg, "__version__", None)
    if _cuda_ver is None:
        # cuda-python >= 13.x has no top-level __version__; detect the version via the cuda-bindings subpackage
        import importlib.metadata as _meta

        _cuda_ver = _meta.version("cuda-bindings")
    _cuda_major = int(_cuda_ver.split(".")[0])
    if _cuda_major >= 13:
        from cuda.bindings import runtime as cudart
    else:
        from cuda import cudart
except Exception as _e:
    import warnings

    warnings.warn(f"cuda-python import failed, async_expert_loader will be unavailable: {_e}")
    cudart = None

from fastdeploy.config import EPLBConfig

@@ -98,6 +113,7 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, epl
    raise ImportError(
        "cuda-python not installed. Install the version matching your CUDA toolkit:\n"
        " CUDA 12.x → pip install cuda-python==12.*\n"
        " CUDA 13.x → pip install cuda-python cuda-bindings\n"
    )

    # Register memory with CUDA
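The fallback above keys the import path off the cuda-python major version. Below is a minimal standalone probe of the same idea, useful for checking which runtime module a given environment will end up with. This is a sketch, not part of the patch; it assumes cuda-python is installed and a CUDA runtime is present.

import importlib.metadata as meta

import cuda  # top-level package installed by the cuda-python distribution

# cuda-python >= 13.x drops the top-level __version__, so fall back to the
# cuda-bindings distribution metadata, mirroring the logic in the patch.
ver = getattr(cuda, "__version__", None) or meta.version("cuda-bindings")
major = int(ver.split(".")[0])

if major >= 13:
    from cuda.bindings import runtime as cudart
else:
    from cuda import cudart

# cudaRuntimeGetVersion() returns (error_code, version), e.g. 12040 for CUDA 12.4.
err, rt_version = cudart.cudaRuntimeGetVersion()
print(f"cuda-python major={major}, runtime version={rt_version}, err={err}")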
@@ -116,36 +116,33 @@ from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOu

DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"

if current_platform.is_cuda():

    def async_set_value(tgt, src):
        if isinstance(src, (int, float, bool)):
            src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
        elif isinstance(src, (list, np.array)):
            dtype_str = str(tgt.dtype).split(".")[1]
            if isinstance(src, list):
                src = np.array(src, dtype=dtype_str if dtype_str != "bfloat16" else "float32")
def async_set_value(tgt, src):
    if isinstance(src, (int, float, bool)):
        src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
    elif isinstance(src, (list, np.ndarray)):
        dtype_str = str(tgt.dtype).split(".")[1]
        if isinstance(src, list):
            src = np.array(src, dtype=dtype_str if dtype_str != "bfloat16" else "float32")
        if current_platform.is_cuda():
            if str(src.dtype) != dtype_str:
                srt_tensor = paddle.empty(tgt.shape, dtype=str(src.dtype))
                src = custom_numpy_to_tensor(src, srt_tensor)
            else:
                return custom_numpy_to_tensor(src, tgt)
    elif isinstance(src, paddle.Tensor):
        pass
    else:
        raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
    if src.shape != tgt.shape:
        src = src.reshape(tgt.shape)
    if src.dtype != tgt.dtype:
        src = src.cast(tgt.dtype)
    if src.place != tgt.place:
        src = src.to(tgt.place)
    tgt.copy_(src, blocking=False)

else:

    def async_set_value(*args, **kwargs):
        raise RuntimeError("async_set_value is only available on CUDA")
            src = paddle.to_tensor(src, dtype=tgt.dtype)
        elif isinstance(src, paddle.Tensor):
            pass
        else:
            raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
        if src.shape != tgt.shape:
            src = src.reshape(tgt.shape)
        if src.dtype != tgt.dtype:
            src = src.cast(tgt.dtype)
        if src.place != tgt.place:
            src = src.to(tgt.place)
        tgt.copy_(src, blocking=False)


def pre_process(
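Every direct slice assignment this patch replaces goes through async_set_value, which materializes the value as a paddle tensor and queues a non-blocking copy_ into the persistent buffer, so host-to-device writes can overlap with other work on the stream. A minimal sketch of that pattern follows, assuming a CUDA build of Paddle; the buffer name is illustrative, not from the patch.

import paddle

# Persistent device buffer, standing in for one entry of share_inputs / model_inputs.
stop_flags = paddle.full([8, 1], True, dtype="bool")


def set_async(tgt, value):
    # Minimal version of the async_set_value idea: build a source tensor with the
    # target's shape/dtype, then queue a non-blocking copy into the target slice.
    src = paddle.full(tgt.shape, fill_value=value, dtype=tgt.dtype)
    tgt.copy_(src, blocking=False)


set_async(stop_flags[2:3], False)  # returns immediately; the copy is queued on the stream
paddle.device.synchronize()        # wait for the queued copies before reading the buffer
print(stop_flags.numpy().flatten())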
@@ -55,6 +55,29 @@ if current_platform.is_xpu():
DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"


def async_set_value(tgt, src):
    if isinstance(src, (int, float, bool)):
        src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
    elif isinstance(src, (list, np.ndarray)):
        dtype_str = str(tgt.dtype).split(".")[1]
        np_dtype = dtype_str if dtype_str != "bfloat16" else "float32"
        if isinstance(src, list):
            src = np.array(src, dtype=np_dtype)
        # TODO: support async_numpy_to_tensor
        src = paddle.to_tensor(src, dtype=tgt.dtype)
    elif isinstance(src, paddle.Tensor):
        pass
    else:
        raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
    if src.shape != tgt.shape:
        src = src.reshape(tgt.shape)
    if src.dtype != tgt.dtype:
        src = src.cast(tgt.dtype)
    if src.place != tgt.place:
        src = src.to(tgt.place)
    tgt.copy_(src, blocking=False)


def _build_stream_transfer_data(
    output_tokens: paddle.Tensor,
    pooler_outputs: List = None,
@@ -49,7 +49,10 @@ if current_platform.is_xpu():
        share_external_data,
        update_attn_mask_offsets,
    )

    # temporary solution
    from fastdeploy.model_executor.xpu_pre_and_post_process import (
        async_set_value,
        xpu_pre_process,
        xpu_process_output,
    )
@@ -483,28 +486,32 @@ class MTPProposer(Proposer):
input_ids = request.prompt_token_ids + request.output_token_ids

self.model_inputs["input_ids_len"][idx] = length - 1
self.model_inputs["pre_ids"][idx : idx + 1] = -1
async_set_value(self.model_inputs["pre_ids"][idx : idx + 1], -1)
self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.target_model_inputs["input_ids"][
    idx : idx + 1, 1:length
]
self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = self.target_model_inputs[
    "input_ids"
][idx : idx + 1, 1:length].cpu()
# TODO: use token_all_ids replace with input_ids_cpu
if getattr(self, "hybrid_mode", False) and "input_ids_cpu" in self.model_inputs:
    self.model_inputs["input_ids_cpu"][idx : idx + 1, : length - 1] = self.target_model_inputs[
        "input_ids"
    ][idx : idx + 1, 1:length].cpu()
encoder_block_num = len(request.block_tables)
self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
self.model_inputs["block_tables"][idx : idx + 1, :] = -1
self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
    request.block_tables, dtype="int32"
async_set_value(self.model_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)
async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
async_set_value(
    self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
)
self.model_inputs["stop_flags"][idx : idx + 1] = False
self.model_inputs["batch_drop"][idx : idx + 1] = False

self.model_inputs["seq_lens_encoder"][idx : idx + 1] = length
async_set_value(self.model_inputs["stop_flags"][idx : idx + 1], False)
async_set_value(self.model_inputs["batch_drop"][idx : idx + 1], False)

async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], length)
self.exist_prefill_flag = True
self.model_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = length
self.model_inputs["step_idx"][idx : idx + 1] = (
    len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
async_set_value(self.model_inputs["seq_lens_decoder"][idx : idx + 1], prefill_start_index)
async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], length)
async_set_value(
    self.model_inputs["step_idx"][idx : idx + 1],
    len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0,
)
if self.use_attn_mask_offset:
    inputs = request.multimodal_inputs
@@ -522,18 +529,19 @@ class MTPProposer(Proposer):
if (
    self.fd_config.scheduler_config.splitwise_role == "decode"
):  # In PD, we continue to decode after P generates the first token
    self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0
    async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], 0)
    self.exist_prefill_flag = False
    self.model_inputs["recompute_token_num"][idx : idx + 1] = 0
    self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = length + 1
    async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], length + 1)
    # NOTE(liuzichang):
    # extra 1: P-D split needs to roll back one step
    self.model_inputs["mask_rollback"][idx : idx + 1] = 1

    async_set_value(self.model_inputs["recompute_token_num"][idx : idx + 1], 0)
    async_set_value(self.model_inputs["mask_rollback"][idx : idx + 1], 1)
    # has_prefill_task = True
elif request.task_type.value == RequestType.DECODE.value:  # decode task
    encoder_block_num = len(request.block_tables)
    self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
    self.model_inputs["block_tables"][idx : idx + 1, :] = -1
    async_set_value(self.model_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)
    async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
    if current_platform.is_cuda():
        async_set_value(
            self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
@@ -542,16 +550,13 @@ class MTPProposer(Proposer):
        self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
            request.block_tables, dtype="int32"
        )
    # if self.model_inputs["is_block_step"][idx]:  # has tasks to continue to decode
    #     has_decode_task = True
    #     continue
else:
    self.model_inputs["block_tables"][idx : idx + 1, :] = -1
    self.model_inputs["stop_flags"][idx : idx + 1] = True
    self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = 0
    self.model_inputs["seq_lens_decoder"][idx : idx + 1] = 0
    self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0
    self.model_inputs["is_block_step"][idx : idx + 1] = False
    async_set_value(self.model_inputs["block_tables"][idx : idx + 1, :], -1)
    async_set_value(self.model_inputs["stop_flags"][idx : idx + 1], True)
    async_set_value(self.model_inputs["seq_lens_this_time_buffer"][idx : idx + 1], 0)
    async_set_value(self.model_inputs["seq_lens_decoder"][idx : idx + 1], 0)
    async_set_value(self.model_inputs["seq_lens_encoder"][idx : idx + 1], 0)
    async_set_value(self.model_inputs["is_block_step"][idx : idx + 1], False)
    continue

# TODO(liuzichang): Solve splitwise-p bug to restore
@@ -1233,6 +1238,7 @@ class MTPProposer(Proposer):
    )

    def _extend_draft_token_with_ngram_match(self):
        # TODO: replace with gpu tensor
        hybrid_mtp_ngram(
            self.model_inputs["input_ids_cpu"].cuda(),
            self.model_inputs["input_ids_len"].cuda(),
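_extend_draft_token_with_ngram_match hands the CPU-resident token history to the hybrid_mtp_ngram op, which extends the MTP draft by n-gram matching against tokens the request has already produced. Below is a simplified pure-Python sketch of that matching idea only; it is not the actual kernel, and the function name, n-gram size, and extension length are illustrative.

def ngram_extend_drafts(token_history, draft_tokens, ngram_size=3, max_extra=4):
    # If the last `ngram_size` tokens of (history + drafts) already occurred in the
    # history, propose the tokens that followed that earlier occurrence as extra drafts.
    context = token_history + draft_tokens
    if len(context) < ngram_size:
        return draft_tokens
    pattern = context[-ngram_size:]
    for start in range(len(token_history) - ngram_size, -1, -1):
        if token_history[start : start + ngram_size] == pattern:
            follow = token_history[start + ngram_size : start + ngram_size + max_extra]
            return draft_tokens + follow
    return draft_tokens


# The trigram (7, 8, 9) appeared earlier in the history, so the tokens after it are proposed.
print(ngram_extend_drafts([1, 2, 7, 8, 9, 4, 5, 6], [7, 8, 9]))  # -> [7, 8, 9, 4, 5, 6]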
@@ -818,9 +818,7 @@ class GPUModelRunner(ModelRunnerBase):
input_ids = prompt_token_ids + request.output_token_ids
prompt_len = len(prompt_token_ids)
# prompt_tokens
self.share_inputs["token_ids_all"][idx : idx + 1, :prompt_len] = np.array(
    prompt_token_ids, dtype="int64"
)
async_set_value(self.share_inputs["token_ids_all"][idx : idx + 1, :prompt_len], prompt_token_ids)
# generated_token_ids fill -1
self.share_inputs["token_ids_all"][idx : idx + 1, prompt_len:] = -1
@@ -830,33 +828,39 @@ class GPUModelRunner(ModelRunnerBase):
self.deterministic_logger.log_prefill_input(
    request.request_id, idx, prefill_start_index, prefill_end_index, input_ids
)

logger.debug(
    f"Handle prefill request {request} at idx {idx}, "
    f"{prefill_start_index=}, {prefill_end_index=}, "
    f"need_prefilled_token_num={len(input_ids)}, "
    f"prompt_len={prompt_len}"
)
self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
    input_ids[prefill_start_index:prefill_end_index]
async_set_value(
    self.share_inputs["input_ids"][idx : idx + 1, :length],
    input_ids[prefill_start_index:prefill_end_index],
)
encoder_block_num = len(request.block_tables)
self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
self.share_inputs["block_tables"][idx : idx + 1, :] = -1
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
    request.block_tables, dtype="int32"
async_set_value(self.share_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)

async_set_value(self.share_inputs["block_tables"][idx : idx + 1, :], -1)

async_set_value(
    self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
)
self.share_inputs["stop_flags"][idx : idx + 1] = False
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = prefill_start_index
self.share_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = length
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length

async_set_value(self.share_inputs["stop_flags"][idx : idx + 1], False)

async_set_value(self.share_inputs["seq_lens_decoder"][idx : idx + 1], prefill_start_index)
async_set_value(self.share_inputs["seq_lens_this_time_buffer"][idx : idx + 1], length)
async_set_value(self.share_inputs["seq_lens_encoder"][idx : idx + 1], length)
self.exist_prefill_flag = True
self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = 0
self.share_inputs["prompt_lens"][idx : idx + 1] = len(input_ids)
self.share_inputs["is_block_step"][idx : idx + 1] = False
async_set_value(self.share_inputs["step_seq_lens_decoder"][idx : idx + 1], 0)
async_set_value(self.share_inputs["prompt_lens"][idx : idx + 1], len(input_ids))

async_set_value(self.share_inputs["is_block_step"][idx : idx + 1], False)
self.share_inputs["is_chunk_step"][idx : idx + 1] = prefill_end_index < len(input_ids)
self.share_inputs["step_idx"][idx : idx + 1] = (
    len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
async_set_value(
    self.share_inputs["step_idx"][idx : idx + 1],
    len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0,
)
# For pooling models, request.sampling_params is None
if request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None:
@@ -878,21 +882,37 @@ class GPUModelRunner(ModelRunnerBase):
if (
    self.fd_config.scheduler_config.splitwise_role == "decode"
):  # In PD, we continue to decode after P generates the first token
    self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
    # TODO: delete useless operations like this
    async_set_value(self.share_inputs["seq_lens_encoder"][idx : idx + 1], 0)
    self.exist_prefill_flag = False
    self._cached_launch_token_num = -1
    if self.speculative_decoding:
        # D speculative decode, seq_lens_this_time = length + 1
        self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + 1
        self.share_inputs["draft_tokens"][idx : idx + 1, 0 : length + 1] = paddle.to_tensor(
            request.draft_token_ids[0 : length + 1],
            dtype="int64",
    if self._cached_launch_token_num != -1:
        token_num_one_step = (
            (self.speculative_config.num_speculative_tokens + 1) if self.speculative_decoding else 1
        )
        self._cached_launch_token_num += token_num_one_step
        self._cached_real_bsz += 1
    if self.speculative_decoding:
        # D first decode step, [Target first token, MTP first draft token]
        # MTP in P only generates one draft token regardless of the num_model_step config
        draft_tokens_to_write = request.draft_token_ids[0:2]
        if len(draft_tokens_to_write) != 2:
            raise ValueError(
                "Expected at least 2 draft tokens for speculative suffix decode, "
                f"but got {len(draft_tokens_to_write)} for request {request.request_id}."
            )
        async_set_value(
            self.share_inputs["draft_tokens"][idx : idx + 1, 0:2],
            draft_tokens_to_write,
        )
        async_set_value(self.share_inputs["seq_lens_this_time_buffer"][idx : idx + 1], 2)
        logger.debug(
            f"insert request {request.request_id} idx: {idx} suffix tokens {request.draft_token_ids}"
        )
elif request.task_type.value == RequestType.DECODE.value:  # decode task
    logger.debug(f"Handle decode request {request} at idx {idx}")
    encoder_block_num = len(request.block_tables)
    self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
    self.share_inputs["block_tables"][idx : idx + 1, :] = -1
    async_set_value(self.share_inputs["encoder_block_lens"][idx : idx + 1], encoder_block_num)
    async_set_value(self.share_inputs["block_tables"][idx : idx + 1, :], -1)
    if current_platform.is_cuda():
        async_set_value(
            self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
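With insert_task overlapped against the in-flight forward pass, the runner tracks how many tokens the already-launched step covers; inserting a prefilled request for decode grows that count by one step's worth of tokens, which is num_speculative_tokens + 1 under speculative decoding (one target token plus the MTP draft). A small arithmetic illustration of that accounting, with hypothetical numbers standing in for the runner's state:

num_speculative_tokens = 1           # MTP proposes one draft token per step
speculative_decoding = True
token_num_one_step = (num_speculative_tokens + 1) if speculative_decoding else 1  # target + draft

cached_launch_token_num = 6          # already launched: 3 requests x 2 tokens each (hypothetical)
cached_real_bsz = 3

# A request finishes prefill on P and is inserted for decode while the step is in flight:
cached_launch_token_num += token_num_one_step  # 6 -> 8
cached_real_bsz += 1                           # 3 -> 4
print(cached_launch_token_num, cached_real_bsz)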
@@ -901,6 +921,7 @@ class GPUModelRunner(ModelRunnerBase):
        self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
            request.block_tables, dtype="int32"
        )
    # CPU Tensor
    self.share_inputs["preempted_idx"][idx : idx + 1, :] = 0
    continue
else:  # preempted task
@@ -909,12 +930,12 @@ class GPUModelRunner(ModelRunnerBase):
elif request.task_type.value == RequestType.ABORT.value:
    logger.info(f"Handle abort request {request} at idx {idx}")
    self.share_inputs["preempted_idx"][idx : idx + 1, :] = 1
    self.share_inputs["block_tables"][idx : idx + 1, :] = -1
    self.share_inputs["stop_flags"][idx : idx + 1] = True
    self.share_inputs["seq_lens_this_time_buffer"][idx : idx + 1] = 0
    self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
    self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
    self.share_inputs["is_block_step"][idx : idx + 1] = False
    async_set_value(self.share_inputs["block_tables"][idx : idx + 1, :], -1)
    async_set_value(self.share_inputs["stop_flags"][idx : idx + 1], True)
    async_set_value(self.share_inputs["seq_lens_this_time_buffer"][idx : idx + 1], 0)
    async_set_value(self.share_inputs["seq_lens_decoder"][idx : idx + 1], 0)
    async_set_value(self.share_inputs["seq_lens_encoder"][idx : idx + 1], 0)
    async_set_value(self.share_inputs["is_block_step"][idx : idx + 1], False)
    self.prompt_logprobs_reqs.pop(request.request_id, None)
    self.in_progress_prompt_logprobs.pop(request.request_id, None)
    self.forward_batch_reqs_list[idx] = None
@@ -926,53 +947,61 @@ class GPUModelRunner(ModelRunnerBase):
    continue

assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)

self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0)
self.share_inputs["temp_scaled_logprobs"][idx : idx + 1] = request.get("temp_scaled_logprobs", False)
self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1] = request.get(
    "top_p_normalized_logprobs", False
self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
async_set_value(self.share_inputs["eos_token_id"][:], request.eos_token_ids)
async_set_value(self.share_inputs["top_p"][idx : idx + 1], request.get("top_p", 0.7))
async_set_value(self.share_inputs["top_k"][idx : idx + 1], request.get("top_k", 0))
async_set_value(self.share_inputs["min_p"][idx : idx + 1], request.get("min_p", 0.0))
async_set_value(self.share_inputs["temperature"][idx : idx + 1], request.get("temperature", 0.95))
async_set_value(self.share_inputs["penalty_score"][idx : idx + 1], request.get("repetition_penalty", 1.0))
async_set_value(self.share_inputs["frequency_score"][idx : idx + 1], request.get("frequency_penalty", 0.0))
async_set_value(self.share_inputs["presence_score"][idx : idx + 1], request.get("presence_penalty", 0.0))
async_set_value(
    self.share_inputs["temp_scaled_logprobs"][idx : idx + 1], request.get("temp_scaled_logprobs", False)
)
self.share_inputs["generated_modality"][idx : idx + 1] = request.get("generated_modality", 0)

self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1)
self.share_inputs["max_dec_len"][idx : idx + 1] = request.get(
    "max_tokens", self.model_config.max_model_len
async_set_value(
    self.share_inputs["top_p_normalized_logprobs"][idx : idx + 1],
    request.get("top_p_normalized_logprobs", False),
)
async_set_value(
    self.share_inputs["generated_modality"][idx : idx + 1], request.get("generated_modality", 0)
)
async_set_value(self.share_inputs["min_dec_len"][idx : idx + 1], request.get("min_tokens", 1))
async_set_value(
    self.share_inputs["max_dec_len"][idx : idx + 1],
    request.get("max_tokens", self.model_config.max_model_len),
)

if request.get("seed") is not None:
    self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed")
    async_set_value(self.share_inputs["infer_seed"][idx : idx + 1], request.get("seed"))

if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
    bad_words_len = len(request.get("bad_words_token_ids"))
    self.share_inputs["bad_tokens_len"][idx] = bad_words_len
    self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
        request.get("bad_words_token_ids"), dtype="int64"
    async_set_value(self.share_inputs["bad_tokens_len"][idx : idx + 1], bad_words_len)
    async_set_value(
        self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len], request.get("bad_words_token_ids")
    )
else:
    self.share_inputs["bad_tokens_len"][idx] = 1
    self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
    async_set_value(self.share_inputs["bad_tokens_len"][idx : idx + 1], 1)
    async_set_value(self.share_inputs["bad_tokens"][idx : idx + 1, :], -1)

if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
    stop_seqs_num = len(request.get("stop_seqs_len"))
    for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num):
        request.sampling_params.stop_seqs_len.append(0)
    self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = np.array(
        request.sampling_params.stop_seqs_len, dtype="int32"
    async_set_value(
        self.share_inputs["stop_seqs_len"][idx : idx + 1, :], request.sampling_params.stop_seqs_len
    )
    self.share_inputs["stop_seqs"][
        idx : idx + 1, :stop_seqs_num, : len(request.get("stop_token_ids")[0])
    ] = np.array(request.get("stop_token_ids"), dtype="int64")
    # Pad each stop sequence to stop_seqs_max_len, fill the remaining rows, and write the whole block at once.
    # This avoids partially slicing the 3rd dimension (non-contiguous memory), which would misalign strides in async_set_value.
    stop_token_ids = request.get("stop_token_ids")
    max_len = self.model_config.stop_seqs_max_len
    padded = [seq + [-1] * (max_len - len(seq)) for seq in stop_token_ids]
    padded.extend([[-1] * max_len] * (self.model_config.max_stop_seqs_num - stop_seqs_num))
    async_set_value(self.share_inputs["stop_seqs"][idx : idx + 1, :, :], padded)
else:
    self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0
    async_set_value(self.share_inputs["stop_seqs_len"][idx : idx + 1, :], 0)

self.pooling_params = batch_pooling_params
# For logits processors
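The padded write above exists because slicing the third dimension of stop_seqs would hand async_set_value a non-contiguous view; padding every stop sequence to stop_seqs_max_len and filling the unused rows lets the whole [max_stop_seqs_num, stop_seqs_max_len] block be copied in one go. A standalone sketch of that padding, with hypothetical limits in place of model_config:

max_stop_seqs_num = 4   # hypothetical model_config.max_stop_seqs_num
stop_seqs_max_len = 6   # hypothetical model_config.stop_seqs_max_len


def pad_stop_seqs(stop_token_ids):
    # Pad each stop sequence to stop_seqs_max_len with -1, then append all -1 rows
    # until there are max_stop_seqs_num rows, so the full block is written at once.
    padded = [seq + [-1] * (stop_seqs_max_len - len(seq)) for seq in stop_token_ids]
    padded.extend([[-1] * stop_seqs_max_len] * (max_stop_seqs_num - len(stop_token_ids)))
    return padded


for row in pad_stop_seqs([[50256], [13, 198, 198]]):
    print(row)
# [50256, -1, -1, -1, -1, -1]
# [13, 198, 198, -1, -1, -1]
# [-1, -1, -1, -1, -1, -1]
# [-1, -1, -1, -1, -1, -1]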
@@ -981,7 +1010,8 @@ class GPUModelRunner(ModelRunnerBase):
    self.sampler.apply_logits_processor(idx, logits_info, prefill_tokens)

self._process_mm_features(req_dicts)
if len(rope_3d_position_ids["position_ids_idx"]) > 0:

if len(rope_3d_position_ids["position_ids_idx"]) > 0 and self.enable_mm:
    packed_position_ids = paddle.to_tensor(
        np.concatenate(rope_3d_position_ids["position_ids_lst"]), dtype="int64"
    )