mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6493)
* Initial plan
* Migrate PRs #6311, #6129, #6305 to develop and merge unit tests
  Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
* fix
* update
* fix
* fix ci
* fix ci
* Initial plan
* test: add test_chat_with_response_max_tokens to test_EB_VL_Lite_serving.py
  Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
* test: add disable-thinking case to test_chat_with_response_max_tokens
  Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
* test: add both reasoning_max_tokens and response_max_tokens case
  Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
* fix ci
* fix ci
* fix ci
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
@@ -307,8 +307,7 @@ elif paddle.is_compiled_with_cuda():
 "gpu_ops/noaux_tc_redundant.cu",
 "gpu_ops/custom_all_reduce/all_reduce.cu",
 "gpu_ops/merge_prefill_decode_output.cu",
-"gpu_ops/limit_thinking_content_length_v1.cu",
-"gpu_ops/limit_thinking_content_length_v2.cu",
+"gpu_ops/limit_thinking_content_length.cu",
 "gpu_ops/update_attn_mask_offsets.cu",
 "gpu_ops/fused_neox_rope_embedding.cu",
 "gpu_ops/gelu_tanh.cu",
@@ -559,8 +558,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
 "gpu_ops/text_image_index_out.cu",
 "gpu_ops/text_image_gather_scatter.cu",
 "gpu_ops/set_data_ipc.cu",
-"gpu_ops/limit_thinking_content_length_v1.cu",
-"gpu_ops/limit_thinking_content_length_v2.cu",
+"gpu_ops/limit_thinking_content_length.cu",
 "gpu_ops/recover_decode_task.cu",
 "gpu_ops/update_inputs_v1.cu",
 "gpu_ops/get_img_boundaries.cc",
@@ -631,8 +629,7 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
 "gpu_ops/text_image_gather_scatter.cu",
 "gpu_ops/text_image_index_out.cu",
 "gpu_ops/get_position_ids_and_mask_encoder_batch.cu",
-"gpu_ops/limit_thinking_content_length_v1.cu",
-"gpu_ops/limit_thinking_content_length_v2.cu",
+"gpu_ops/limit_thinking_content_length.cu",
 "gpu_ops/update_attn_mask_offsets.cu",
 "gpu_ops/append_attn/mla_cache_kernel.cu",
 "gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu",
Reference in New Issue
Block a user