[Model Runner] Deprecate not_need_stop (#6356)

* Deprecate not_need_stop
This commit is contained in:
sunxin
2026-03-05 10:55:42 +08:00
committed by GitHub
parent fa4815b93a
commit 0dc7034ce0
12 changed files with 534 additions and 165 deletions
+17 -6
View File
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/extension.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
@@ -49,9 +50,19 @@ void cuda_host_free(uintptr_t ptr) {
check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}
paddle::Tensor GetStop(paddle::Tensor& not_need_stop);
void SetStop(paddle::Tensor& not_need_stop, bool flag);
// Copies the contents of a host numpy array into an existing device tensor.
//
// The copy is issued asynchronously on the tensor's own CUDA stream. Per the
// CUDA docs, cudaMemcpyAsync from pageable host memory stages the data before
// returning, so the caller may release the numpy buffer afterwards.
//
// @param numpy_array  Source host array. Its flat byte size is assumed not to
//                     exceed the destination allocation — not checked here;
//                     TODO confirm all callers guarantee matching sizes.
// @param tensor       Destination device tensor; written in place.
// @return The destination tensor, for call-chaining convenience.
// @throws CudaError (via check_cuda_error) if the copy fails to launch.
paddle::Tensor CustomNumpyToTensor(py::array numpy_array,
                                   paddle::Tensor tensor) {
  py::buffer_info buf_info = numpy_array.request();
  const size_t data_size =
      static_cast<size_t>(buf_info.size) * static_cast<size_t>(buf_info.itemsize);
  // Previously the cudaMemcpyAsync return code was silently ignored; surface
  // launch failures (bad pointer, invalid stream) to Python like the other
  // helpers in this file do.
  check_cuda_error(cudaMemcpyAsync(const_cast<void*>(tensor.data()),
                                   buf_info.ptr,
                                   data_size,
                                   cudaMemcpyHostToDevice,
                                   tensor.stream()));
  return tensor;
}
void FlashAttentionMask(const paddle::Tensor& q_input,
const paddle::Tensor& k_input,
@@ -1722,7 +1733,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_attn_mask_q", &get_attn_mask_q, "get_attn_mask_q function");
m.def("get_stop", &GetStop, "get_stop function");
m.def("set_stop", &SetStop, "set_stop function");
m.def("custom_numpy_to_tensor",
&CustomNumpyToTensor,
"custom_numpy_to_tensor function");
}
-30
View File
@@ -1,30 +0,0 @@
// Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "helper.h"
// Reads the single-element boolean `not_need_stop` flag tensor and returns a
// freshly allocated CPU tensor holding a copy of its value.
// Note: dereferences the source buffer directly on the host — assumes the
// flag tensor lives in host-accessible memory (TODO confirm with callers).
paddle::Tensor GetStop(paddle::Tensor& not_need_stop) {
  auto stop_flag_cpu =
      GetEmptyTensor({1}, paddle::DataType::BOOL, paddle::CPUPlace());
  const bool* src = not_need_stop.data<bool>();
  bool* dst = const_cast<bool*>(stop_flag_cpu.data<bool>());
  *dst = *src;
  return stop_flag_cpu;
}
// Writes `flag` into the single-element boolean `not_need_stop` flag tensor,
// in place.
void SetStop(paddle::Tensor& not_need_stop, bool flag) {
  auto* stop_data = const_cast<bool*>(not_need_stop.data<bool>());
  *stop_data = flag;
}
@@ -155,15 +155,6 @@ std::vector<paddle::Tensor> SpeculatePreProcess(
auto cu_seqlens_k =
paddle::empty({bsz + 1}, paddle::DataType::INT32, input_ids.place());
if (token_num_data == 0) {
return {ids_remove_padding,
batch_id_per_token,
cu_seqlens_q,
cu_seqlens_k,
paddle::Tensor(),
paddle::Tensor(),
paddle::Tensor()};
}
#ifdef PADDLE_WITH_COREX
int blockSize =
std::min((token_num_data + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE, 128);
@@ -185,6 +176,16 @@ std::vector<paddle::Tensor> SpeculatePreProcess(
auto real_output_token_num =
paddle::empty({1}, paddle::DataType::INT32, input_ids.place());
if (token_num_data == 0) {
return {ids_remove_padding,
batch_id_per_token,
cu_seqlens_q,
cu_seqlens_k,
cu_seq_lens_q_output,
batch_id_per_token_output,
real_output_token_num};
}
int64_t *ids_remove_padding_ptr = ids_remove_padding.data<int64_t>();
int *batch_id_per_token_ptr = batch_id_per_token.data<int>();
int *cu_seqlens_q_ptr = cu_seqlens_q.data<int>();
@@ -49,10 +49,6 @@ void cuda_host_free(uintptr_t ptr) {
check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}
paddle::Tensor GetStop(paddle::Tensor& not_need_stop);
void SetStop(paddle::Tensor& not_need_stop, bool flag);
PYBIND11_MODULE(fastdeploy_ops, m) {
/**
* alloc_cache_pinned.cc
@@ -67,8 +63,4 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def(
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
py::register_exception<CudaError>(m, "CudaError");
m.def("get_stop", &GetStop, "get_stop function");
m.def("set_stop", &SetStop, "set_stop function");
}
-8
View File
@@ -49,10 +49,6 @@ void cuda_host_free(uintptr_t ptr) {
check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
}
paddle::Tensor GetStop(paddle::Tensor& not_need_stop);
void SetStop(paddle::Tensor& not_need_stop, bool flag);
std::vector<paddle::Tensor> SpeculatePreProcess(
const int64_t cpu_token_num,
const paddle::Tensor& input_ids,
@@ -76,10 +72,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
py::register_exception<CudaError>(m, "CudaError");
m.def("get_stop", &GetStop, "get_stop function");
m.def("set_stop", &SetStop, "set_stop function");
m.def("speculate_pre_process",
&SpeculatePreProcess,
"speculate_pre_process function");
-3
View File
@@ -270,7 +270,6 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/stop_generation.cu",
"gpu_ops/stop_generation_multi_ends.cu",
"gpu_ops/set_flags.cu",
"gpu_ops/set_stop.cu",
"gpu_ops/update_inputs_v1.cu",
"gpu_ops/recover_decode_task.cu",
"gpu_ops/step.cu",
@@ -562,7 +561,6 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"gpu_ops/recover_decode_task.cu",
"gpu_ops/update_inputs_v1.cu",
"gpu_ops/get_img_boundaries.cc",
"gpu_ops/set_stop.cu",
"gpu_ops/fused_neox_rope_embedding.cu",
"gpu_ops/get_output_ep.cc",
"iluvatar_ops/moe_dispatch.cu",
@@ -649,7 +647,6 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
"gpu_ops/unset_data_ipc.cu",
"gpu_ops/swap_cache_batch.cu",
"gpu_ops/gelu_tanh.cu",
"gpu_ops/set_stop.cu",
"metax_ops/moe_dispatch.cu",
"metax_ops/moe_ffn.cu",
"metax_ops/moe_reduce.cu",
@@ -101,6 +101,7 @@ else:
speculate_step_reschedule,
limit_thinking_content_length,
speculate_limit_thinking_content_length,
custom_numpy_to_tensor,
)
from fastdeploy.model_executor.entropy_utils import (
@@ -114,6 +115,37 @@ from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, SamplerOu
DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1"
if current_platform.is_cuda():
def async_set_value(tgt, src):
    """Asynchronously copy ``src`` into the existing tensor ``tgt``.

    Scalars are broadcast to ``tgt``'s shape/dtype, lists and numpy arrays
    are staged through ``custom_numpy_to_tensor`` (an async H2D copy on
    ``tgt``'s stream), and paddle tensors are copied directly. The final
    ``copy_`` is non-blocking.

    Args:
        tgt: destination ``paddle.Tensor``; modified in place.
        src: ``int``/``float``/``bool`` scalar, ``list``, ``np.ndarray``
            or ``paddle.Tensor``.

    Raises:
        ValueError: for any other ``src`` type.
    """
    if isinstance(src, (int, float, bool)):
        # Broadcast the scalar to the destination's shape/dtype.
        src = paddle.full(tgt.shape, fill_value=src, dtype=tgt.dtype)
    # BUG FIX: was `np.array`, which is a factory *function*, not a type —
    # isinstance(x, np.array) raises TypeError at runtime. `np.ndarray` is
    # the actual array type.
    elif isinstance(src, (list, np.ndarray)):
        dtype_str = str(tgt.dtype).split(".")[1]
        if isinstance(src, list):
            # numpy has no bfloat16 dtype; stage through float32 and let the
            # cast path below convert on device.
            src = np.array(src, dtype=dtype_str if dtype_str != "bfloat16" else "float32")
        if str(src.dtype) != dtype_str:
            # dtypes differ: copy into a temp device tensor of src's dtype,
            # then fall through to the reshape/cast/copy path below.
            src_tensor = paddle.empty(tgt.shape, dtype=str(src.dtype))
            src = custom_numpy_to_tensor(src, src_tensor)
        else:
            # dtypes match: copy host data straight into tgt and return.
            return custom_numpy_to_tensor(src, tgt)
    elif isinstance(src, paddle.Tensor):
        pass
    else:
        raise ValueError("async_set_value unsupported src type: {}".format(type(src)))
    if src.shape != tgt.shape:
        src = src.reshape(tgt.shape)
    if src.dtype != tgt.dtype:
        src = src.cast(tgt.dtype)
    if src.place != tgt.place:
        src = src.to(tgt.place)
    tgt.copy_(src, blocking=False)
else:
def async_set_value(*args, **kwargs):
raise RuntimeError("async_set_value is only available on CUDA")
def pre_process(
token_num_cpu: int,
@@ -870,7 +902,7 @@ def post_process_pooling(
)
update_inputs_v1(
model_output.stop_flags,
model_output.not_need_stop,
model_output.not_need_stop_device,
model_output.seq_lens_this_time,
model_output.seq_lens_encoder,
model_output.seq_lens_decoder,
+108 -88
View File
@@ -57,10 +57,8 @@ from fastdeploy.worker.input_batch import InputBatch, reorder_split_prefill_and_
if current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import (
get_stop,
recover_decode_task,
set_data_ipc,
set_stop,
set_value_by_flags_and_idx,
)
@@ -72,9 +70,7 @@ elif current_platform.is_dcu():
share_external_data = None
else:
from fastdeploy.model_executor.ops.gpu import (
get_stop,
recover_decode_task,
set_stop,
set_value_by_flags_and_idx,
share_external_data,
speculate_schedule_cache,
@@ -83,6 +79,7 @@ else:
)
from fastdeploy.model_executor.pre_and_post_process import (
async_set_value,
post_process,
pre_process,
rebuild_padding,
@@ -267,14 +264,16 @@ class GPUModelRunner(ModelRunnerBase):
)
# for overlap
self.last_model_output_data = None
self.last_sampler_output = None
self.last_post_process_event = None
self.last_token_num = -1
self._cached_model_output_data = None
self._cached_sampler_output = None
self._cached_post_process_event = None
# Cached token count for next batch prediction in overlap scheduling.
# Used to avoid synchronization overhead when preparing inputs for the next batch.
self._cached_launch_token_num = -1
self.enable_overlap_schedule = fd_config.scheduler_config.enable_overlap_schedule and (
not self.speculative_decoding
)
self.current_launch_token_num = 0
def _async_output_busy_loop(self):
"""Entrypoint for the thread which handles outputs asynchronously."""
@@ -297,6 +296,46 @@ class GPUModelRunner(ModelRunnerBase):
"""
return (self.share_inputs["seq_lens_decoder"] > 0).any().cpu().numpy().item()
def _resolve_current_launch_token_num(
    self, cached_token_num: int, token_num_event, is_dummy_or_profile_run: bool
) -> int:
    """
    Resolve token count for current batch.
    In overlap mode, uses cached value from previous batch prediction to avoid GPU-CPU sync.
    Falls back to fresh computation in certain conditions:
    - dummy/profile runs need accurate counts
    - non-overlap mode doesn't support caching
    - prefill stage changes batch composition
    - invalid cached value
    """
    if (
        is_dummy_or_profile_run
        or (not self.enable_overlap_schedule)
        or self.exist_prefill()
        or cached_token_num <= 0
    ):
        # Slow path: wait for the async DtoH copy backing
        # seq_lens_this_time_cpu to land, then count tokens on the host.
        token_num_event.synchronize()
        return self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item()
    # Fast path: reuse the prediction computed while the previous batch ran.
    return cached_token_num
def _predict_next_launch_token_num(self) -> int:
    """
    Predict token count for next batch.
    In overlap scheduling, while current batch executes model forward,
    the scheduler may have prepared decode requests for next batch.
    This prediction allows next batch to skip synchronization.
    Returns -1 if prediction is not applicable (non-overlap or prefill exists).
    """
    if self.exist_prefill():
        return -1
    # Decode-only case: sum of the current per-request token counts, plus the
    # requests currently parked by block-step scheduling (which may resume).
    return (
        self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item()
        + self.share_inputs["is_block_step_cpu"].numpy().sum().item()
    )
def only_prefill(self):
"""
check whether prefill only
@@ -711,8 +750,6 @@ class GPUModelRunner(ModelRunnerBase):
self.initialize_kv_cache()
req_len = len(req_dicts)
has_prefill_task = False
has_decode_task = False
batch_pooling_params = []
self.share_inputs["num_running_requests"] = num_running_requests
@@ -828,7 +865,6 @@ class GPUModelRunner(ModelRunnerBase):
if request.sampling_params is not None and request.sampling_params.prompt_logprobs is not None:
self.prompt_logprobs_reqs[request.request_id] = request
self.forward_batch_reqs_list[idx] = request
has_prefill_task = True
if self.speculative_decoding and self.speculative_method == "suffix" and self.proposer is not None:
if isinstance(request.prompt_token_ids, np.ndarray):
@@ -859,11 +895,14 @@ class GPUModelRunner(ModelRunnerBase):
encoder_block_num = len(request.block_tables)
self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num
self.share_inputs["block_tables"][idx : idx + 1, :] = -1
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
request.block_tables, dtype="int32"
)
if self.share_inputs["is_block_step"][idx]: # has tasks to continue to decode
has_decode_task = True
if current_platform.is_cuda():
async_set_value(
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num], request.block_tables
)
else:
self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
request.block_tables, dtype="int32"
)
self.share_inputs["preempted_idx"][idx : idx + 1, :] = 0
continue
else: # preempted task
@@ -940,8 +979,6 @@ class GPUModelRunner(ModelRunnerBase):
self.sampler.apply_logits_processor(idx, logits_info, prefill_tokens)
self._process_mm_features(req_dicts)
if has_prefill_task or has_decode_task:
set_stop(self.share_inputs["not_need_stop"], True)
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:num_running_requests]
if self.speculative_method in ["mtp"]:
@@ -1067,7 +1104,7 @@ class GPUModelRunner(ModelRunnerBase):
)
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"]
def _prepare_inputs(self, last_token_num=-1, is_dummy_or_profile_run=False) -> None:
def _prepare_inputs(self, cached_token_num=-1, is_dummy_or_profile_run=False) -> None:
"""Prepare the model inputs"""
if self.enable_mm and self.share_inputs["image_features_list"] is not None:
tensor_feats = [t for t in self.share_inputs["image_features_list"] if isinstance(t, paddle.Tensor)]
@@ -1115,16 +1152,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["is_block_step_cpu"].copy_(self.share_inputs["is_block_step"], False)
token_num_event = paddle.device.cuda.create_event()
token_num_event.record()
if (
is_dummy_or_profile_run
or (not self.enable_overlap_schedule)
or self.exist_prefill()
or last_token_num <= 0
):
token_num_event.synchronize()
token_num = self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item()
else:
token_num = last_token_num
token_num = self._resolve_current_launch_token_num(cached_token_num, token_num_event, is_dummy_or_profile_run)
(
ids_remove_padding,
batch_id_per_token,
@@ -1189,7 +1217,7 @@ class GPUModelRunner(ModelRunnerBase):
logits_processors=self.share_inputs["logits_processors"],
share_inputs=self.share_inputs,
)
return token_num_event
return token_num, token_num_event
def _process_reorder(self) -> None:
if self.attn_backends and getattr(self.attn_backends[0], "enable_ids_reorder", False):
@@ -1968,14 +1996,17 @@ class GPUModelRunner(ModelRunnerBase):
model_forward_batch: Optional[List[Request]] = None,
num_running_requests: int = None,
) -> None:
model_output, p_done_idxs, token_num_event = self._preprocess_and_execute_model(
model_forward_batch, num_running_requests
)
model_output_data, sampler_output, post_process_event, _ = self._postprocess(
model_output, p_done_idxs, token_num_event, model_forward_batch, num_running_requests
model_output, p_done_idxs, _ = self._preprocess_and_execute_model(model_forward_batch, num_running_requests)
if model_output is None:
return
model_output_data, sampler_output, post_process_event = self._postprocess(
model_output, p_done_idxs, model_forward_batch, num_running_requests
)
if model_output_data is not None and not self.speculative_decoding:
self._save_model_output(model_output_data, sampler_output, post_process_event)
# synchronizes the async DtoH copies of sampled_token_ids.
post_process_event.synchronize()
self._save_model_output(model_output_data, sampler_output)
def execute_model_overlap(
self,
@@ -1984,40 +2015,61 @@ class GPUModelRunner(ModelRunnerBase):
) -> None:
# preprocess and execute model (current batch)
model_output, p_done_idxs, token_num_event = self._preprocess_and_execute_model(
model_forward_batch, num_running_requests, self.last_token_num
model_forward_batch, num_running_requests, self._cached_launch_token_num
)
# save output (last batch)
if self.last_model_output_data is not None and not self.speculative_decoding:
if self._cached_model_output_data is not None:
# synchronizes the async DtoH copies of sampled_token_ids.
self._cached_post_process_event.synchronize()
self._save_model_output(
self.last_model_output_data, self.last_sampler_output, self.last_post_process_event
self._cached_model_output_data,
self._cached_sampler_output,
)
# postprocess (current batch)
model_output_data, sampler_output, post_process_event, token_num = self._postprocess(
model_output, p_done_idxs, token_num_event, model_forward_batch, num_running_requests
)
self.last_model_output_data = model_output_data
self.last_sampler_output = sampler_output
self.last_post_process_event = post_process_event
self.last_token_num = token_num
# synchronizes the async DtoH copies of seq_lens_this_time_cpu and is_block_step_cpu,
# ensuring that the token count for the current batch is ready to be computed and reused in the subsequent batch.
token_num_event.synchronize()
next_launch_token_num = self._predict_next_launch_token_num()
if self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item() > 0 and model_output is not None:
model_output_data, sampler_output, post_process_event = self._postprocess(
model_output, p_done_idxs, model_forward_batch, num_running_requests
)
self._cached_model_output_data = model_output_data
self._cached_sampler_output = sampler_output
self._cached_post_process_event = post_process_event
else:
self._cached_model_output_data = None
self._cached_sampler_output = None
self._cached_post_process_event = None
self._cached_launch_token_num = next_launch_token_num
def _preprocess_and_execute_model(
self,
model_forward_batch: Optional[List[Request]] = None,
num_running_requests: int = None,
last_token_num: int = -1,
cached_token_num: int = -1,
) -> None:
if self.deterministic_logger is not None:
self.deterministic_logger.log_batch_start(model_forward_batch)
# 1. Prepare inputs of model and sampler.
p_done_idxs = self._get_p_done_idxs_gd(model_forward_batch, num_running_requests)
# Reorder inputs to split prefill and decode tokens
self._process_reorder()
token_num_event = self._prepare_inputs(last_token_num)
# 1. Prepare inputs of model and sampler.
current_launch_token_num, token_num_event = self._prepare_inputs(cached_token_num)
self.current_launch_token_num = current_launch_token_num
# NOTE(sunxin):
# If current_launch_token_num is 0, it means the current worker is in an idle state,
# and no further processing is required in TP mode.
# However, in EP (Expert Parallelism) mode, there is data on other runner,
# the current runner is required to execute part of the model.
if current_launch_token_num == 0 and not self.parallel_config.use_ep:
return None, None, token_num_event
p_done_idxs = self._get_p_done_idxs_gd(model_forward_batch, num_running_requests)
self.sampler.pre_process(p_done_idxs)
# 1.1 Update state of logits processor
@@ -2039,39 +2091,18 @@ class GPUModelRunner(ModelRunnerBase):
ids_remove_padding=self.forward_meta.ids_remove_padding,
forward_meta=self.forward_meta,
)
if self.use_cudagraph:
model_output = model_output[: self.real_token_num]
return model_output, p_done_idxs, token_num_event
def _postprocess(
self,
model_output: paddle.Tensor,
p_done_idxs: List[int],
token_num_event,
model_forward_batch: Optional[List[Request]] = None,
num_running_requests: int = None,
) -> None:
# NOTE(wufeisheng): If `not_need_stop`` is False, it means the current worker is in an idle state.
# This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
# Then there is data on other runner, the current runner is required to execute part of the model.
# But not need to run the below code.
if not self.not_need_stop():
return None, None, None, -1
if self.use_cudagraph:
model_output = model_output[: self.real_token_num]
# NOTE(sunxin):
# token_num_event synchronizes the async DtoH copies of seq_lens_this_time_cpu and is_block_step_cpu,
# ensuring that the token count for the current batch is ready to be computed and reused in the subsequent batch.
token_num_event.synchronize()
if (not self.enable_overlap_schedule) or self.exist_prefill():
token_num = -1
else:
token_num = (
self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item()
+ self.share_inputs["is_block_step_cpu"].numpy().sum().item()
)
if self.speculative_decoding:
self.output_token_num_event.synchronize()
real_num = int(self._real_output_token_num_host)
@@ -2125,9 +2156,8 @@ class GPUModelRunner(ModelRunnerBase):
async_output_queue=self.async_output_queue,
enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
)
self.share_inputs["not_need_stop"].copy_(self.share_inputs["not_need_stop_device"], True)
return None, None, None, -1
return None, None, None
else:
hidden_states = rebuild_padding(
model_output,
@@ -2323,7 +2353,6 @@ class GPUModelRunner(ModelRunnerBase):
post_process_event = paddle.device.cuda.create_event()
if not self.speculative_decoding:
self.share_inputs["sampled_token_ids"].copy_(sampler_output.sampled_token_ids, False)
self.share_inputs["not_need_stop"].copy_(self.share_inputs["not_need_stop_device"], False)
post_process_event.record()
self.exist_prefill_flag = False
@@ -2336,17 +2365,13 @@ class GPUModelRunner(ModelRunnerBase):
and self.share_inputs["is_chunk_step"].sum() == 0
):
self.routing_replay_manager.put_table_to_store()
return model_output_data, sampler_output, post_process_event, token_num
return model_output_data, sampler_output, post_process_event
def _save_model_output(
self,
model_output_data,
sampler_output,
post_process_event,
):
# NOTE(sunxin):
# post_process_event synchronizes the async DtoH copies of not_need_stop and sampled_token_ids.
post_process_event.synchronize()
save_output_normal(
model_output=model_output_data,
sampler_output=sampler_output,
@@ -2515,11 +2540,6 @@ class GPUModelRunner(ModelRunnerBase):
required_memory = byte_of_dtype * 2 * (self.cache_config.block_size * hidden_dim) * num_layers # k + v
return required_memory
# TODO(sunxin): Remove not_need_stop!!!
def not_need_stop(self) -> bool:
"""Stop decoding if the tensor meets the termination condition"""
return get_stop(self.share_inputs["not_need_stop"]).item()
def clear_cache(self, profile=False):
"""Clear cached data from shared inputs and forward metadata"""
create_cache_tensor = profile or not (
+1 -1
View File
@@ -152,7 +152,7 @@ class InputBatch:
self.seq_lens_this_time_cpu = paddle.full([max_num_seqs, 1], 0, dtype="int32", device="cpu")
self.is_block_step_cpu = paddle.full([max_num_seqs], False, dtype="bool", device="cpu")
else:
self.not_need_stop = paddle.full([1], False, dtype="bool").pin_memory()
self.not_need_stop = paddle.full([1], False, dtype="bool").cpu()
self.sampled_token_ids = paddle.full([max_num_seqs, 1], -1, dtype="int64").pin_memory()
self.seq_lens_this_time_cpu = paddle.full([max_num_seqs, 1], 0, dtype="int32").pin_memory()
self.is_block_step_cpu = paddle.full([max_num_seqs], False, dtype="bool").pin_memory()
+3 -5
View File
@@ -62,10 +62,8 @@ from fastdeploy.model_executor.model_loader import get_model_loader
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
from fastdeploy.model_executor.models.interfaces_base import FdModelForPooling
from fastdeploy.model_executor.ops.gpu import (
get_stop,
recover_decode_task,
set_data_ipc,
set_stop,
set_value_by_flags_and_idx,
share_external_data,
speculate_schedule_cache,
@@ -884,7 +882,7 @@ class MetaxModelRunner(ModelRunnerBase):
self._process_mm_features(req_dicts)
if has_prefill_task or has_decode_task:
set_stop(self.share_inputs["not_need_stop"], True)
self.share_inputs["not_need_stop"][0] = True
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:num_running_requests]
if self.speculative_method in ["mtp"]:
@@ -967,7 +965,7 @@ class MetaxModelRunner(ModelRunnerBase):
self.sampler.apply_logits_processor(idx, logits_info, prefill_tokens)
set_stop(self.share_inputs["not_need_stop"], True)
self.share_inputs["not_need_stop"][0] = True
self.share_inputs["seq_lens_this_time"] = self.share_inputs["seq_lens_this_time_buffer"][:num_running_requests]
@@ -2489,7 +2487,7 @@ class MetaxModelRunner(ModelRunnerBase):
def not_need_stop(self) -> bool:
"""Stop decoding if the tensor meets the termination condition"""
return get_stop(self.share_inputs["not_need_stop"]).item()
return self.share_inputs["not_need_stop"][0]
def clear_cache(self, profile=False):
"""Clear cached data from shared inputs and forward metadata"""
+15 -6
View File
@@ -171,8 +171,10 @@ class PaddleDisWorkerProc:
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
self.speculative_decoding = fd_config.speculative_config.method is not None
self.enable_overlap_schedule = self.scheduler_config.enable_overlap_schedule and (
not self.speculative_decoding
self.enable_overlap_schedule = (
current_platform.is_cuda()
and self.scheduler_config.enable_overlap_schedule
and (not self.speculative_decoding)
)
def init_control(self):
@@ -560,12 +562,11 @@ class PaddleDisWorkerProc:
self.worker.preprocess_new_task(req_dicts, max_occupied_batch_index)
if (
(not self.parallel_config.use_ep)
and (not self.worker.model_runner.not_need_stop())
and (not self.enable_overlap_schedule)
not self.parallel_config.use_ep
and not current_platform.is_cuda()
and not self.worker.model_runner.not_need_stop()
):
self._tp_barrier_wait() if tp_size > 1 else None
time.sleep(0.001)
continue
@@ -578,6 +579,14 @@ class PaddleDisWorkerProc:
self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()
logger.debug(f"execute model cost: {time.time()-start_execute_time:.5f} s")
if (
not self.parallel_config.use_ep
and current_platform.is_cuda()
and self.worker.model_runner.current_launch_token_num == 0
):
self._tp_barrier_wait() if tp_size > 1 else None
time.sleep(0.001)
def initialize_kv_cache(self) -> None:
"""Profiles the peak memory usage of the model to determine how many
KV blocks may be allocated without OOMs.
@@ -0,0 +1,347 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import shutil
import signal
import subprocess
import sys
import time
import pytest
import requests
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)
from e2e.utils.serving_utils import (
FD_API_PORT,
FD_CACHE_QUEUE_PORT,
FD_ENGINE_QUEUE_PORT,
FD_METRICS_PORT,
clean_ports,
is_port_open,
)
os.environ["FD_ATTENTION_BACKEND"] = "FLASH_ATTN"
os.environ["FLAGS_flash_attn_version"] = "3"
os.environ["FD_SAMPLING_CLASS"] = "rejection"
@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
    Pytest fixture that runs once per test session:
    - Cleans ports before tests
    - Starts the API server as a subprocess
    - Waits for server port to open (up to 480 seconds)
    - Tears down server after all tests finish
    """
    print("Pre-test port cleanup...")
    clean_ports()
    # Remove stale logs so test_profile_reset_block_num reads a fresh config.log.
    print("log dir clean ")
    if os.path.exists("log") and os.path.isdir("log"):
        shutil.rmtree("log")
    # Resolve the model directory: $MODEL_PATH/Qwen3-30B-A3B if set, else CWD.
    base_path = os.getenv("MODEL_PATH")
    if base_path:
        model_path = os.path.join(base_path, "Qwen3-30B-A3B")
    else:
        model_path = "./Qwen3-30B-A3B"
    log_path = "server.log"
    cmd = [
        sys.executable,
        "-m",
        "fastdeploy.entrypoints.openai.api_server",
        "--model",
        model_path,
        "--port",
        str(FD_API_PORT),
        "--tensor-parallel-size",
        "4",
        "--engine-worker-queue-port",
        str(FD_ENGINE_QUEUE_PORT),
        "--metrics-port",
        str(FD_METRICS_PORT),
        "--cache-queue-port",
        str(FD_CACHE_QUEUE_PORT),
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "50",
        "--enable-overlap-schedule",
    ]
    # Start subprocess in new process group
    with open(log_path, "w") as logfile:
        process = subprocess.Popen(
            cmd,
            stdout=logfile,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # Enables killing full group via os.killpg
        )
    # Wait up to 480 seconds for API server to be ready (polls once per second).
    for _ in range(480):
        if is_port_open("127.0.0.1", FD_API_PORT):
            print(f"API server is up on port {FD_API_PORT}")
            break
        time.sleep(1)
    else:
        # for/else: the loop finished without break — the server never came up.
        print("API server failed to start in time. Cleaning up...")
        try:
            os.killpg(process.pid, signal.SIGTERM)
        except Exception as e:
            print(f"Failed to kill process group: {e}")
        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
    yield  # Run tests
    print("\n===== Post-test server cleanup... =====")
    try:
        os.killpg(process.pid, signal.SIGTERM)
        print(f"API server (pid={process.pid}) terminated")
    except Exception as e:
        print(f"Failed to terminate API server: {e}")
@pytest.fixture(scope="session")
def api_url(request):
    """
    Returns the API endpoint URL for chat completions.
    """
    # FD_API_PORT is the same port the server fixture passed via --port.
    return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions"
@pytest.fixture(scope="session")
def metrics_url(request):
    """
    Returns the metrics endpoint URL.
    """
    # FD_METRICS_PORT is the same port the server fixture passed via --metrics-port.
    return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics"
@pytest.fixture
def headers():
    """
    Returns common HTTP request headers.
    """
    # All requests in this module post JSON bodies.
    return {"Content-Type": "application/json"}
@pytest.fixture
def consistent_payload():
    """
    Returns a fixed payload for consistency testing,
    including a fixed random seed and temperature.
    """
    return {
        "messages": [
            {
                "role": "user",
                # Prompt asks for a one-sentence intro to PaddlePaddle,
                # with "/no_think" to suppress the reasoning preamble.
                "content": "用一句话介绍 PaddlePaddle, 30字以内 /no_think",
            }
        ],
        "temperature": 0.8,
        "top_p": 0,  # fix top_p to reduce randomness
        "seed": 13,  # fixed random seed
    }
# ==========================
# Helper function to calculate difference rate between two texts
# ==========================
def calculate_diff_rate(text1, text2):
    """
    Calculate the difference rate between two strings
    based on the normalized Levenshtein edit distance.

    Args:
        text1: first string.
        text2: second string.

    Returns:
        A float in [0, 1], where 0 means identical and 1 means
        entirely different (relative to the longer string's length).
    """
    if text1 == text2:
        return 0.0
    len1, len2 = len(text1), len(text2)
    max_len = max(len1, len2)
    if max_len == 0:
        # Both empty — already caught by the equality check, kept for safety.
        return 0.0
    # Two-row DP: O(len2) memory instead of the full (len1+1)x(len2+1) table,
    # identical result. prev[j] == edit distance between text1[:i-1] and text2[:j].
    prev = list(range(len2 + 1))
    for i in range(1, len1 + 1):
        curr = [i] + [0] * len2
        for j in range(1, len2 + 1):
            if text1[i - 1] == text2[j - 1]:
                curr[j] = prev[j - 1]
            else:
                # 1 + min(delete, insert, substitute)
                curr[j] = 1 + min(prev[j], curr[j - 1], prev[j - 1])
        prev = curr
    return prev[len2] / max_len
# ==========================
# Consistency test for repeated runs with fixed payload
# ==========================
def test_consistency_between_runs(api_url, headers, consistent_payload):
    """
    Test that two runs with the same fixed input produce similar outputs.
    """
    # Issue the identical request twice and collect both completions.
    contents = []
    for _ in range(2):
        resp = requests.post(api_url, headers=headers, json=consistent_payload)
        assert resp.status_code == 200
        contents.append(resp.json()["choices"][0]["message"]["content"])
    # Normalized edit distance between the two outputs must stay small.
    diff_rate = calculate_diff_rate(contents[0], contents[1])
    assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})"
# ==========================
# think Prompt Test
# ==========================
def test_thinking_prompt(api_url, headers):
    """
    Test case to verify normal 'thinking' behavior (no '/no_think' appended).
    """
    payload = {
        "messages": [{"role": "user", "content": "北京天安门在哪里"}],
        "max_tokens": 100,
        "temperature": 0.8,
        "top_p": 0.01,
    }
    resp = requests.post(api_url, headers=headers, json=payload)
    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
    try:
        body = resp.json()
    except Exception as e:
        assert False, f"Response is not valid JSON: {e}"
    content = body.get("choices", [{}])[0].get("message", {}).get("content", "").lower()
    assert "天安门" in content or "北京" in content, "Expected a location-related response with reasoning"
# ==========================
# no_think Prompt Test
# ==========================
def test_non_thinking_prompt(api_url, headers):
    """
    Test case to verify non-thinking behavior (with '/no_think').
    """
    payload = {
        "messages": [{"role": "user", "content": "北京天安门在哪里 /no_think"}],
        "max_tokens": 100,
        "temperature": 0.8,
        "top_p": 0.01,
    }
    resp = requests.post(api_url, headers=headers, json=payload)
    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
    try:
        body = resp.json()
    except Exception as e:
        assert False, f"Response is not valid JSON: {e}"
    content = body.get("choices", [{}])[0].get("message", {}).get("content", "").lower()
    # None of the reasoning markers should appear when thinking is disabled.
    reasoning_markers = ["根据", "我认为", "推测", "可能"]
    assert not any(
        x in content for x in reasoning_markers
    ), "Expected no reasoning in non-thinking response"
def test_profile_reset_block_num():
    """Check the profile reset_block_num feature: the value logged in
    config.log must be within 5% of the baseline."""
    log_file = "./log/config.log"
    baseline = 40000
    if not os.path.exists(log_file):
        pytest.fail(f"Log file not found: {log_file}")
    with open(log_file, "r") as f:
        log_lines = f.readlines()
    # First line mentioning the reset, stripped of surrounding whitespace.
    target_line = next((ln.strip() for ln in log_lines if "Reset block num" in ln), None)
    if target_line is None:
        pytest.fail("日志中没有Reset block num信息")
    match = re.search(r"total_block_num:(\d+)", target_line)
    if not match:
        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")
    try:
        actual_value = int(match.group(1))
    except ValueError:
        pytest.fail(f"Invalid number format: {match.group(1)}")
    lower_bound = baseline * (1 - 0.05)
    upper_bound = baseline * (1 + 0.05)
    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")
    assert lower_bound <= actual_value <= upper_bound, (
        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
    )
def test_thinking_with_stop_token_ids(api_url, headers):
    """
    Test case to verify thinking behavior when stop token ids are provided.
    """
    payload = {
        "messages": [{"role": "user", "content": "北京天安门在哪里"}],
        "max_tokens": 100,
        "temperature": 0.8,
        "seed": 1,
        "stop_token_ids": [105930],
    }
    resp = requests.post(api_url, headers=headers, json=payload)
    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
    try:
        body = resp.json()
    except Exception as e:
        assert False, f"Response is not valid JSON: {e}"
    content = body.get("choices", [{}])[0].get("message", {}).get("content", "")
    # With a fixed seed and this stop token, generation halts mid-thought at
    # exactly this prefix.
    expected_output = "<think>\n好的,用户问“北京天安门在哪里"
    assert content == expected_output, (
        f"Unexpected response content.\n" f"Expected: {expected_output!r}\n" f"Actual: {content!r}"
    )