[Optimization] Streaming requests return complete special tokens. (#6998)

* return special token

* add completions

* update

* fix

* add prompt_token_ids & completion_token_ids

* fix unit test
This commit is contained in:
luukunn
2026-03-26 09:49:43 +08:00
committed by GitHub
parent d5cb2767d7
commit e6804ba97d
4 changed files with 39 additions and 36 deletions
@@ -553,9 +553,16 @@ class OpenAIServingCompletion:
num_image_tokens[idx] += output.get("num_image_tokens")
reasoning_tokens[idx] += output.get("reasoning_token_num", 0)
output_speculate_metrics = res["metrics"].get("speculate_metrics", None)
if output["tool_calls"] is not None:
tool_called[idx] = True
if output["skipped"] and not request.return_token_ids:
continue
delta_message = CompletionResponseStreamChoice(
index=idx,
text=output["text"],
text="" if output["skipped"] else (output["text"] or ""),
prompt_token_ids=None,
completion_token_ids=output.get("token_ids") if request.return_token_ids else None,
tool_calls=output["tool_calls"],
@@ -570,12 +577,6 @@ class OpenAIServingCompletion:
speculate_metrics=output_speculate_metrics,
)
if output["tool_calls"] is not None:
tool_called[idx] = True
if output["skipped"]:
continue
choices.append(delta_message)
if res["finished"]: