mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
[Feature] support pooling model dummy_run (#4345)
* support qwen3-embedding
* fix ci bug
* support pooling dummy_run
* fix
* delete print
* parallel_config.max_model_len
* delete is_pooling_model in dummy_run
* fix
* fd_model
* fix embedding load
* fix
* fix post_process
This commit is contained in:
@@ -69,8 +69,9 @@ def build_pooling_cursor(num_scheduled_tokens: list[int], prompt_lens: paddle.Te
     n_seq = len(num_scheduled_tokens)
     index = list(range(n_seq))
-    num_scheduled_tokens = paddle.to_tensor(num_scheduled_tokens, device="cpu")
-    cumsum = paddle.zeros([n_seq + 1], dtype="int64", place=paddle.CPUPlace())
+    num_scheduled_tokens = paddle.to_tensor(num_scheduled_tokens)
+    cumsum = paddle.zeros([n_seq + 1], dtype="int64")

     paddle.cumsum(num_scheduled_tokens, axis=0, out=cumsum[1:])
     if device == "gpu":
         cumsum_device = cumsum.cuda()
||||
Reference in New Issue
Block a user