diff --git a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml index 30a50170bd..a5bb750ba9 100644 --- a/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml +++ b/benchmarks/yaml/x1-64k-w4a8c8-tp4.yaml @@ -1,5 +1,5 @@ -reasoning-parser: ernie_x1 -tool_call_parser: ernie_x1 +reasoning-parser: ernie-x1 +tool_call_parser: ernie-x1 tensor_parallel_size: 4 max_model_len: 65536 max_num_seqs: 128 diff --git a/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml index 09236610af..4476a55a9f 100644 --- a/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml +++ b/benchmarks/yaml/x1-a3b-128k-wint8-h800-tp1.yaml @@ -1,7 +1,7 @@ tensor_parallel_size: 1 max_model_len: 131072 max_num_seqs: 32 -reasoning_parser: ernie_x1 -tool_call_parser: ernie_x1 +reasoning_parser: ernie-x1 +tool_call_parser: ernie-x1 load_choices: "default_v1" quantization: wint8 diff --git a/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md b/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md index 05328ff08f..a67be76fef 100644 --- a/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md +++ b/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ --tensor-parallel-size 1 \ --max-model-len 131072 \ --quantization wint8 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ + --reasoning-parser ernie-x1 \ + --tool-call-parser ernie-x1 \ --max-num-seqs 32 ``` - `--quantization`: Indicates the quantization strategy used by the model. Different quantization strategies will result in different performance and accuracy of the model. It could be one of `wint8` / `wint4` / `block_wise_fp8`(Hopper is needed). diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index 378a80a7b6..c4c319f83a 100644 --- a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use Machete for wint4 dense GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit diff --git a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md index 0dc0db5277..c2648ceb33 100644 --- a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md +++ b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Thinking.md @@ -33,8 +33,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ --tensor-parallel-size 1 \ --max-model-len 131072 \ --quantization wint8 \ - --reasoning-parser ernie_x1 \ - --tool-call-parser ernie_x1 \ + --reasoning-parser ernie-x1 \ + --tool-call-parser ernie-x1 \ --max-num-seqs 32 ``` 其中: diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index f778735eeb..b0a162a8aa 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -80,7 +80,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # 是否使用 Machete 后端的 wint4 GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # cache_transfer_manager 进程残留时退出等待超时时间 diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 1b846d8fc0..3036f530fc 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -197,7 +197,7 @@ class Request: guided_grammar=d.get("guided_grammar", None), structural_tag=d.get("structural_tag", None), guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", False), + enable_thinking=d.get("enable_thinking", None), reasoning_max_tokens=d.get("reasoning_max_tokens", None), trace_carrier=d.get("trace_carrier", {}), chat_template=d.get("chat_template", None), diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b18fc51022..cf11ba8fff 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -621,7 +621,7 @@ class OpenAIServingChat: if output is not None and output.get("metrics") and output["metrics"].get("request_start_time"): work_process_metrics.e2e_request_latency.observe( - time.time() - output.get("metrics").get("request_start_time") + time.time() - data.get("metrics").get("request_start_time") ) message = ChatMessage( role="assistant", @@ -655,7 +655,7 @@ class OpenAIServingChat: finish_reason = "tool_calls" else: finish_reason = "length" - if output.get("error_msg") is not None and "Recover" in output["error_msg"]: + if data.get("error_msg") is not None and "Recover" in data["error_msg"]: finish_reason = "recover_stop" return ChatCompletionResponseChoice( diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py index d6ac8f81aa..906483f445 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -95,6 +95,7 @@ class ToolParserManager: Raise a KeyError exception if the name is not registered. """ + name = name.replace("_", "-") if name in cls.tool_parsers: return cls.tool_parsers[name] diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py index 131c17e6ab..1cb8c0ab71 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_45_vl_thinking_tool_parser.py @@ -44,7 +44,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ( from fastdeploy.utils import data_processor_logger -@ToolParserManager.register_module("ernie_45-vl-thinking") +@ToolParserManager.register_module("ernie-45-vl-thinking") class Ernie45VLThinkingToolParser(ToolParser): """ Tool parser for Ernie model version 4.5.1. diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py index 14a784f174..8a14abee87 100644 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py @@ -44,7 +44,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ( from fastdeploy.utils import data_processor_logger -@ToolParserManager.register_module("ernie_x1") +@ToolParserManager.register_module("ernie-x1") class ErnieX1ToolParser(ToolParser): """ Tool parser for Ernie model version 4.5.1. diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 05b042d7a9..d60750d6a9 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -122,7 +122,7 @@ environment_variables: dict[str, Callable[[], Any]] = { "FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")), # enable return text, used when FD_ENABLE_INTERNAL_ADAPTER=1 "FD_ENABLE_RETURN_TEXT": lambda: bool(int(os.getenv("FD_ENABLE_RETURN_TEXT", "0"))), - # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie-45-vl, \n\n\n for ernie-x1) "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py index e0daacdc6a..a151dbfdd6 100644 --- a/fastdeploy/input/ernie4_5_processor.py +++ b/fastdeploy/input/ernie4_5_processor.py @@ -130,7 +130,7 @@ class Ernie4_5Processor(BaseDataProcessor): if chat_template_kwargs: if isinstance(chat_template_kwargs, dict): for k, v in chat_template_kwargs.items(): - if k not in task: + if k not in task or task[k] is None: task[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 75e068c400..45ce5dda25 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -245,7 +245,7 @@ class DataProcessor(BaseDataProcessor): if chat_template_kwargs: if isinstance(chat_template_kwargs, dict): for k, v in chat_template_kwargs.items(): - if k not in task: + if k not in task or task[k] is None: task[k] = v else: raise ValueError("Invalid input: chat_template_kwargs must be a dict") diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index bddb12b496..bcbd25dbf1 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -101,7 +101,7 @@ def limit_thinking_content_length( line_break_id: int = None, ): if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl limit_thinking_content_length_v1( sampled_token_ids, max_think_lens, @@ -110,7 +110,7 @@ def limit_thinking_content_length( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, @@ -136,7 +136,7 @@ def speculate_limit_thinking_content_length( line_break_id: int = None, ): if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl speculate_limit_thinking_content_length_v1( accept_tokens, max_think_lens, @@ -147,7 +147,7 @@ def speculate_limit_thinking_content_length( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 speculate_limit_thinking_content_length_v2( accept_tokens, diff --git a/fastdeploy/reasoning/abs_reasoning_parsers.py b/fastdeploy/reasoning/abs_reasoning_parsers.py index 50e01e5a9f..0f3e6e3183 100644 --- a/fastdeploy/reasoning/abs_reasoning_parsers.py +++ b/fastdeploy/reasoning/abs_reasoning_parsers.py @@ -125,6 +125,7 @@ class ReasoningParserManager: Raise a KeyError exception if the name is not registered. """ + name = name.replace("_", "-") if name in cls.reasoning_parsers: return cls.reasoning_parsers[name] diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py index 54b72a0eb5..77fc1d5ada 100644 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py @@ -5,10 +5,10 @@ from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaM from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager -@ReasoningParserManager.register_module("ernie_x1") +@ReasoningParserManager.register_module("ernie-x1") class ErnieX1ReasoningParser(ReasoningParser): """ - Reasoning parser for ernie_x1 model with stricter boundary checking. + Reasoning parser for ernie-x1 model with stricter boundary checking. Unified rules: - Do not strip newline before diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f9dad431e7..58d7ff8a45 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -203,7 +203,7 @@ def xpu_post_process( step_idx = share_inputs["step_idx"] limit_think_status = share_inputs["limit_think_status"] if limit_strategy == "": - # for ernie4_5_vl + # for ernie-45-vl limit_thinking_content_length_v1( sampled_token_ids, max_think_lens, @@ -212,7 +212,7 @@ def xpu_post_process( think_end_id, ) elif limit_strategy == "\n\n\n": - # for ernie_x1 + # for ernie-x1 assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index 01b6346e03..3a772f9193 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -412,7 +412,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): "test_data": { "request_id": "test_1", "outputs": { - "token_ids": [789], + "token_ids": [123, 456, 789], "text": "Edge case response", "reasoning_content": None, "tool_call": None, @@ -424,7 +424,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): "previous_num_tokens": 1, }, "mock_request": ChatCompletionRequest( - model="test", messages=[], return_token_ids=True, max_tokens=5, n=2 + model="test", messages=[], return_token_ids=True, max_tokens=1, n=2 ), "expected": { "index": 1, @@ -434,7 +434,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): "raw_prediction": None, "num_cached_tokens": 0, "num_image_tokens": 0, - "finish_reason": "stop", + "finish_reason": "length", }, }, ] diff --git a/tests/entrypoints/openai/test_serving_completion.py b/tests/entrypoints/openai/test_serving_completion.py index d48ce4d5b7..c2a36d1855 100644 --- a/tests/entrypoints/openai/test_serving_completion.py +++ b/tests/entrypoints/openai/test_serving_completion.py @@ -73,9 +73,9 @@ class TestOpenAIServingCompletion(unittest.TestCase): self.assertTrue(serving_completion._check_master()) def test_calc_finish_reason_tool_calls(self): - # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" + # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1" engine_client = Mock() - engine_client.reasoning_parser = "ernie_x1" + engine_client.reasoning_parser = "ernie-x1" # 创建一个OpenAIServingCompletion实例 serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) # 创建一个模拟的output,并设置finish_reason为"tool_call" @@ -86,9 +86,9 @@ class TestOpenAIServingCompletion(unittest.TestCase): assert result == "tool_calls" def test_calc_finish_reason_stop(self): - # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie_x1" + # 创建一个模拟的engine_client,并设置reasoning_parser为"ernie-x1" engine_client = Mock() - engine_client.reasoning_parser = "ernie_x1" + engine_client.reasoning_parser = "ernie-x1" # 创建一个OpenAIServingCompletion实例 serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360) # 创建一个模拟的output,并设置finish_reason为其他值 diff --git a/tests/reasoning/test_reasoning_parser.py b/tests/reasoning/test_reasoning_parser.py index 9e06523b02..26a7457db9 100644 --- a/tests/reasoning/test_reasoning_parser.py +++ b/tests/reasoning/test_reasoning_parser.py @@ -91,7 +91,7 @@ class TestReasoningParserManager(unittest.TestCase): Test that a parser can be registered and retrieved successfully. Verifies normal registration and retrieval functionality. """ - ReasoningParserManager.register_module(module=TestReasoningParser, name="test_parser", force=True) + ReasoningParserManager.register_module(module=TestReasoningParser, name="test-parser", force=True) parser_cls = ReasoningParserManager.get_reasoning_parser("test_parser") self.assertIs(parser_cls, TestReasoningParser)