diff --git a/custom_ops/iluvatar_ops/cpp_extensions.cc b/custom_ops/iluvatar_ops/cpp_extensions.cc
new file mode 100644
index 0000000000..c085f2a612
--- /dev/null
+++ b/custom_ops/iluvatar_ops/cpp_extensions.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+#include "pybind11/pybind11.h"
+namespace py = pybind11;
+
+// 自定义异常类,用于处理CUDA错误
+class CudaError : public std::exception {
+ public:
+  explicit CudaError(cudaError_t error) : error_(error) {}
+
+  const char* what() const noexcept override {
+    return cudaGetErrorString(error_);
+  }
+
+ private:
+  cudaError_t error_;
+};
+
+// 检查CUDA错误并抛出异常
+void check_cuda_error(cudaError_t error) {
+  if (error != cudaSuccess) {
+    throw CudaError(error);
+  }
+}
+
+// 封装cudaHostAlloc的Python函数
+uintptr_t cuda_host_alloc(size_t size,
+                          unsigned int flags = cudaHostAllocDefault) {
+  void* ptr = nullptr;
+  check_cuda_error(cudaHostAlloc(&ptr, size, flags));
+  return reinterpret_cast<uintptr_t>(ptr);
+}
+
+// 封装cudaFreeHost的Python函数
+void cuda_host_free(uintptr_t ptr) {
+  check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
+}
+
+paddle::Tensor GetStop(paddle::Tensor& not_need_stop);
+
+void SetStop(paddle::Tensor& not_need_stop, bool flag);
+
+PYBIND11_MODULE(fastdeploy_ops, m) {
+  /**
+   * alloc_cache_pinned.cc
+   * cuda_host_alloc
+   * cuda_host_free
+   */
+  m.def("cuda_host_alloc",
+        &cuda_host_alloc,
+        "Allocate pinned memory",
+        py::arg("size"),
+        py::arg("flags") = cudaHostAllocDefault);
+  m.def(
+      "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
+  py::register_exception<CudaError>(m, "CudaError");
+
+  m.def("get_stop", &GetStop, "get_stop function");
+
+  m.def("set_stop", &SetStop, "set_stop function");
+}
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 1213d92dca..8610511db8 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -564,6 +564,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "gpu_ops/recover_decode_task.cu",
                 "gpu_ops/update_inputs_v1.cu",
                 "gpu_ops/get_img_boundaries.cc",
+                "gpu_ops/set_stop.cu",
                 "iluvatar_ops/moe_dispatch.cu",
                 "iluvatar_ops/moe_reduce.cu",
                 "iluvatar_ops/paged_attn.cu",
@@ -571,6 +572,7 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "iluvatar_ops/mixed_fused_attn.cu",
                 "iluvatar_ops/w8a16_group_gemm.cu",
                 "iluvatar_ops/runtime/iluvatar_context.cc",
+                "iluvatar_ops/cpp_extensions.cc",
             ],
             include_dirs=["iluvatar_ops/runtime", "gpu_ops"],
             extra_link_args=[
diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
index 4f090297d3..f907048e46 100644
--- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
+++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py
@@ -102,6 +102,7 @@ def apply_penalty_multi_scores(
            presence_penalties,
            temperature,
            bad_words_token_ids,
+            bad_words_token_len,
            step_idx,
            min_dec_lens,
            eos_token_ids,
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index dc8a485f7d..3b48dd41d1 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -882,6 +882,7 @@ def rebuild_padding(
            seq_lens_decoder,
            seq_lens_encoder,
            batch_id_per_token_output,
+            cu_seqlens_q_output,
            first_token_out,
            enable_logprob,
        )
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 9544657a58..e4ab707f62 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -52,14 +52,15 @@ from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
 from fastdeploy.model_executor.model_loader import get_model_loader
-from fastdeploy.model_executor.ops.gpu import get_stop, set_stop
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.input_batch import InputBatch, reorder_split_prefill_and_decode
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import (
+        get_stop,
         recover_decode_task,
         set_data_ipc,
+        set_stop,
         set_value_by_flags_and_idx,
     )
@@ -71,7 +72,9 @@ elif current_platform.is_dcu():
     share_external_data = None
 else:
     from fastdeploy.model_executor.ops.gpu import (
+        get_stop,
         recover_decode_task,
+        set_stop,
         set_value_by_flags_and_idx,
         share_external_data,
         speculate_schedule_cache,
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 63c0222891..baf790d166 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """
 
+from functools import partial
+
 import paddle
 
 from fastdeploy import envs
@@ -22,6 +24,24 @@ from fastdeploy.model_executor.layers.attention import IluvatarAttnBackend
 from fastdeploy.worker.gpu_model_runner import GPUModelRunner
 
 
+def _patch_before_model_runner():
+    paddle.Tensor.pin_memory = paddle.Tensor.cpu
+    paddle.device.cuda.create_event = partial(paddle.device.custom_device.create_event, device_type="iluvatar_gpu")
+
+    def disable_record(self):
+        pass
+
+    paddle.device.custom_device.Event.record = disable_record
+
+    def disable_synchronize(self):
+        pass
+
+    paddle.device.custom_device.Event.synchronize = disable_synchronize
+
+
+_patch_before_model_runner()
+
+
 class IluvatarModelRunner(GPUModelRunner):
     def __init__(
         self,
diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh
index 712eed4b2e..7d73595b5b 100644
--- a/scripts/run_ci_iluvatar.sh
+++ b/scripts/run_ci_iluvatar.sh
@@ -11,24 +11,39 @@ unset http_proxy
 unset https_proxy
 unset no_proxy
 
+export FD_LOG_DIR=/fdlog/$HOSTNAME
+echo "FD log will be saved into $FD_LOG_DIR"
 export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
 ln -sf /usr/local/bin/python3 /usr/local/bin/python
 echo "pip requirements"
 python -m pip install -r requirements_iluvatar.txt
 echo "install paddle cpu and custom device"
-python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
-#python -m pip install paddlepaddle==3.3.0.dev20251219 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-#python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251223 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+# python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+# python -m pip install --pre paddle-iluvatar-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+python -m pip install paddlepaddle==3.4.0.dev20260206 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+python -m pip install paddle-iluvatar-gpu==3.0.0.dev20260206 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+
+INCLUDE_FOLDERS=(
+    "ERNIE_300B_4L"
+    "ERNIE-4.5-21B-A3B-Paddle"
+    "ERNIE-4.5-VL-28B-A3B-Paddle"
+    "PaddleOCR-VL"
+)
 
 MODEL_DIR=/model_data
 mkdir -p $MODEL_DIR
 SOURCE_DIR=/aistudio/paddle_ci
-for file in "$SOURCE_DIR"/*; do
+echo "ls $SOURCE_DIR"
+ls $SOURCE_DIR
+
+for filename in "${INCLUDE_FOLDERS[@]}"; do
+    file=$SOURCE_DIR/$filename
     echo "start copy $file into $MODEL_DIR ..."
     cp -r $file $MODEL_DIR
 done
+
 echo "copy done"
+echo "ls $MODEL_DIR"
 ls $MODEL_DIR
 
 echo "build whl"