diff --git a/custom_ops/metax_ops/cpp_extensions.cc b/custom_ops/metax_ops/cpp_extensions.cc
new file mode 100644
index 0000000000..008c3da235
--- /dev/null
+++ b/custom_ops/metax_ops/cpp_extensions.cc
@@ -0,0 +1,52 @@
+#include "paddle/extension.h"
+#include "pybind11/pybind11.h"
+namespace py = pybind11;
+
+// Custom exception type that carries a CUDA error code.
+class CudaError : public std::exception {
+ public:
+  explicit CudaError(cudaError_t error) : error_(error) {}
+
+  const char* what() const noexcept override {
+    return cudaGetErrorString(error_);
+  }
+
+ private:
+  cudaError_t error_;
+};
+
+// Throw CudaError if a CUDA runtime call did not return cudaSuccess.
+void check_cuda_error(cudaError_t error) {
+  if (error != cudaSuccess) {
+    throw CudaError(error);
+  }
+}
+
+// Python-facing wrapper for cudaHostAlloc; returns the pinned-host
+// pointer as an integer so it can cross the pybind11 boundary.
+uintptr_t cuda_host_alloc(size_t size,
+                          unsigned int flags = cudaHostAllocDefault) {
+  void* ptr = nullptr;
+  check_cuda_error(cudaHostAlloc(&ptr, size, flags));
+  return reinterpret_cast<uintptr_t>(ptr);
+}
+
+// Python-facing wrapper for cudaFreeHost; accepts the integer returned
+// by cuda_host_alloc.
+void cuda_host_free(uintptr_t ptr) {
+  check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
+}
+
+PYBIND11_MODULE(fastdeploy_ops, m) {
+  /**
+   * alloc_cache_pinned.cc
+   * cuda_host_alloc
+   * cuda_host_free
+   */
+  m.def("cuda_host_alloc",
+        &cuda_host_alloc,
+        "Allocate pinned memory",
+        py::arg("size"),
+        py::arg("flags") = cudaHostAllocDefault);
+  m.def(
+      "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
+  py::register_exception<CudaError>(m, "CudaError");
+}
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 2d01dcdb35..765de06030 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -637,12 +637,17 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
         "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
         "gpu_ops/sample_kernels/min_p_sampling_from_probs.cu",
+        "gpu_ops/get_data_ptr_ipc.cu",
+        "gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu",
+        "gpu_ops/unset_data_ipc.cu",
+        "gpu_ops/swap_cache_batch.cu",
"metax_ops/moe_dispatch.cu", "metax_ops/moe_ffn.cu", "metax_ops/moe_reduce.cu", "metax_ops/fused_moe.cu", "metax_ops/apply_rope_qkv.cu", "metax_ops/cache_kv_with_rope.cu", + "metax_ops/cpp_extensions.cc", ] sources += find_end_files("gpu_ops/speculate_decoding", ".cu") diff --git a/fastdeploy/cache_manager/ops.py b/fastdeploy/cache_manager/ops.py index f78f5431e3..370188d217 100644 --- a/fastdeploy/cache_manager/ops.py +++ b/fastdeploy/cache_manager/ops.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import os + import paddle from fastdeploy.platforms import current_platform @@ -39,6 +41,29 @@ try: def get_peer_mem_addr(*args, **kwargs): raise RuntimeError("CUDA no need of get_peer_mem_addr!") + elif current_platform.is_maca(): + from fastdeploy.model_executor.ops.gpu import ( # get_output_kv_signal,; ipc_sent_key_value_cache_by_remote_ptr_block_sync, + cuda_host_alloc, + cuda_host_free, + get_data_ptr_ipc, + ipc_sent_key_value_cache_by_remote_ptr, + set_data_ipc, + share_external_data, + swap_cache_all_layers, + unset_data_ipc, + ) + + memory_allocated = paddle.device.memory_allocated + + def get_peer_mem_addr(*args, **kwargs): + raise RuntimeError("CUDA no need of get_peer_mem_addr!") + + def get_output_kv_signal(*args, **kwargs): + raise RuntimeError("Metax get_output_kv_signal UNIMPLENENTED!") + + def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs): + raise RuntimeError("Metax ipc_sent_key_value_cache_by_remote_ptr_block_sync UNIMPLENENTED!") + elif current_platform.is_xpu(): from fastdeploy.model_executor.ops.xpu import ( cuda_host_alloc, @@ -69,6 +94,8 @@ try: def set_device(device): if current_platform.is_cuda(): paddle.set_device(f"gpu:{device}") + elif current_platform.is_maca(): + paddle.set_device(f"metax_gpu:{device}") elif current_platform.is_xpu(): paddle.set_device(f"xpu:{device}") else: @@ -77,6 +104,8 @@ try: def share_external_data_(cache, cache_name, cache_shape, use_ipc): if current_platform.is_cuda(): cache = 
share_external_data(cache, cache_name, cache_shape) + elif current_platform.is_maca(): + cache = share_external_data(cache, cache_name, cache_shape) elif current_platform.is_xpu(): cache = share_external_data(cache, cache_name, cache_shape, use_ipc) else: @@ -86,6 +115,8 @@ try: def get_all_visible_devices(): if current_platform.is_xpu(): return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" + elif current_platform.is_maca(): + return f'MACA_VISIBLE_DEVICES={os.environ.get("MACA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7")}' else: return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 44520220ad..b7d7f4d807 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -531,7 +531,12 @@ class EngineArgs: self.tokenizer = self.model if self.splitwise_role == "decode": self.enable_prefix_caching = False - if not current_platform.is_cuda() and not current_platform.is_xpu() and not current_platform.is_intel_hpu(): + if ( + not current_platform.is_cuda() + and not current_platform.is_xpu() + and not current_platform.is_intel_hpu() + and not current_platform.is_maca() + ): self.enable_prefix_caching = False if self.enable_logprob: if not current_platform.is_cuda() and not current_platform.is_xpu():