diff --git a/custom_ops/iluvatar_ops/cpp_extensions.cc b/custom_ops/iluvatar_ops/cpp_extensions.cc
new file mode 100644
index 0000000000..c085f2a612
--- /dev/null
+++ b/custom_ops/iluvatar_ops/cpp_extensions.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+#include "pybind11/pybind11.h"
+namespace py = pybind11;
+
+// 自定义异常类,用于处理CUDA错误
+class CudaError : public std::exception {
+ public:
+  explicit CudaError(cudaError_t error) : error_(error) {}
+
+  const char* what() const noexcept override {
+    return cudaGetErrorString(error_);
+  }
+
+ private:
+  cudaError_t error_;
+};
+
+// 检查CUDA错误并抛出异常
+void check_cuda_error(cudaError_t error) {
+  if (error != cudaSuccess) {
+    throw CudaError(error);
+  }
+}
+
+// 封装cudaHostAlloc的Python函数
+uintptr_t cuda_host_alloc(size_t size,
+                          unsigned int flags = cudaHostAllocDefault) {
+  void* ptr = nullptr;
+  check_cuda_error(cudaHostAlloc(&ptr, size, flags));
+  return reinterpret_cast<uintptr_t>(ptr);
+}
+
+// 封装cudaFreeHost的Python函数
+void cuda_host_free(uintptr_t ptr) {
+  check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
+}
+
+paddle::Tensor GetStop(paddle::Tensor& not_need_stop);
+
+void SetStop(paddle::Tensor& not_need_stop, bool flag);
+
+PYBIND11_MODULE(fastdeploy_ops, m) {
+  /**
+   * alloc_cache_pinned.cc
+   * cuda_host_alloc
+   * cuda_host_free
+   */
+  m.def("cuda_host_alloc",
+        &cuda_host_alloc,
+        "Allocate pinned memory",
+        py::arg("size"),
+        py::arg("flags") = cudaHostAllocDefault);
+  m.def(
+      "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
+  py::register_exception<CudaError>(m, "CudaError");
+
+  m.def("get_stop", &GetStop, "get_stop function");
+
+  m.def("set_stop", &SetStop, "set_stop function");
+}
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 1213d92dca..8610511db8 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -564,6 +564,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "gpu_ops/recover_decode_task.cu",
                 "gpu_ops/update_inputs_v1.cu",
                 "gpu_ops/get_img_boundaries.cc",
+                "gpu_ops/set_stop.cu",
                 "iluvatar_ops/moe_dispatch.cu",
                 "iluvatar_ops/moe_reduce.cu",
                 "iluvatar_ops/paged_attn.cu",
@@ -571,6 +572,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "iluvatar_ops/mixed_fused_attn.cu",
                 "iluvatar_ops/w8a16_group_gemm.cu",
                 "iluvatar_ops/runtime/iluvatar_context.cc",
+                "iluvatar_ops/cpp_extensions.cc",
             ],
             include_dirs=["iluvatar_ops/runtime", "gpu_ops"],
             extra_link_args=[
diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
index 4f090297d3..f907048e46 100644
--- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
+++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
@@ -102,6 +102,7 @@ def apply_penalty_multi_scores(
            presence_penalties,
            temperature,
            bad_words_token_ids,
+            bad_words_token_len,
            step_idx,
            min_dec_lens,
            eos_token_ids,
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index dc8a485f7d..3b48dd41d1 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -882,6 +882,7 @@ def rebuild_padding(
            seq_lens_decoder,
            seq_lens_encoder,
            batch_id_per_token_output,
+            cu_seqlens_q_output,
            first_token_out,
            enable_logprob,
        )
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 9544657a58..e4ab707f62 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -52,14 +52,15 @@ from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
 from fastdeploy.model_executor.model_loader import get_model_loader
-from fastdeploy.model_executor.ops.gpu import get_stop, set_stop
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.input_batch import InputBatch, reorder_split_prefill_and_decode
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import (
+        get_stop,
         recover_decode_task,
         set_data_ipc,
+        set_stop,
         set_value_by_flags_and_idx,
     )
@@ -71,7 +72,9 @@ elif current_platform.is_dcu():
     share_external_data = None
 else:
     from fastdeploy.model_executor.ops.gpu import (
+        get_stop,
         recover_decode_task,
+        set_stop,
         set_value_by_flags_and_idx,
         share_external_data,
         speculate_schedule_cache,
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 63c0222891..baf790d166 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """
 
+from functools import partial
+
 import paddle
 
 from fastdeploy import envs
@@ -22,6 +24,24 @@ from fastdeploy.model_executor.layers.attention import IluvatarAttnBackend
 from fastdeploy.worker.gpu_model_runner import GPUModelRunner
 
 
+def _patch_before_model_runner():
+    paddle.Tensor.pin_memory = paddle.Tensor.cpu
+    paddle.device.cuda.create_event = partial(paddle.device.custom_device.create_event, device_type="iluvatar_gpu")
+
+    def disable_record(self):
+        pass
+
+    paddle.device.custom_device.Event.record = disable_record
+
+    def disable_synchronize(self):
+        pass
+
+    paddle.device.custom_device.Event.synchronize = disable_synchronize
+
+
+_patch_before_model_runner()
+
+
 class IluvatarModelRunner(GPUModelRunner):
     def __init__(
         self,
diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh
index 712eed4b2e..7d73595b5b 100644
--- a/scripts/run_ci_iluvatar.sh
+++ b/scripts/run_ci_iluvatar.sh
@@ -11,24 +11,39 @@ unset http_proxy
 unset https_proxy
 unset no_proxy
 
+export FD_LOG_DIR=/fdlog/$HOSTNAME
+echo "FD log will be saved into $FD_LOG_DIR"
 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
 ln -sf /usr/local/bin/python3 /usr/local/bin/python
 echo "pip requirements"
 python -m pip install -r requirements_iluvatar.txt
 echo "install paddle cpu and custom device"
-python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
-#python -m pip install paddlepaddle==3.3.0.dev20251219 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-#python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251223 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+# python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+# python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+python -m pip install paddlepaddle==3.4.0.dev20260206 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+python -m pip install paddle-iluvatar-gpu==3.0.0.dev20260206 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+
+INCLUDE_FOLDERS=(
+    "ERNIE_300B_4L"
+    "ERNIE-4.5-21B-A3B-Paddle"
+    "ERNIE-4.5-VL-28B-A3B-Paddle"
+    "PaddleOCR-VL"
+)
 
 MODEL_DIR=/model_data
 mkdir -p $MODEL_DIR
 SOURCE_DIR=/aistudio/paddle_ci
-for file in "$SOURCE_DIR"/*; do
+echo "ls $SOURCE_DIR"
+ls $SOURCE_DIR
+
+for filename in "${INCLUDE_FOLDERS[@]}"; do
+    file=$SOURCE_DIR/$filename
     echo "start copy $file into $MODEL_DIR ..."
     cp -r $file $MODEL_DIR
 done
+
 echo "copy done"
+echo "ls $MODEL_DIR"
 ls $MODEL_DIR
 
 echo "build whl"