diff --git a/custom_ops/metax_ops/cpp_extensions.cc b/custom_ops/metax_ops/cpp_extensions.cc
new file mode 100644
index 0000000000..008c3da235
--- /dev/null
+++ b/custom_ops/metax_ops/cpp_extensions.cc
@@ -0,0 +1,52 @@
+#include "paddle/extension.h"
+#include "pybind11/pybind11.h"
+namespace py = pybind11;
+
+// Custom exception type that carries a CUDA error code.
+class CudaError : public std::exception {
+ public:
+  explicit CudaError(cudaError_t error) : error_(error) {}
+
+  const char* what() const noexcept override {
+    return cudaGetErrorString(error_);
+  }
+
+ private:
+  cudaError_t error_;
+};
+
+// Throw CudaError if a CUDA runtime call did not return cudaSuccess.
+void check_cuda_error(cudaError_t error) {
+  if (error != cudaSuccess) {
+    throw CudaError(error);
+  }
+}
+
+// Python-facing wrapper for cudaHostAlloc; returns the pinned-host
+// pointer as an integer so it can cross the pybind11 boundary.
+uintptr_t cuda_host_alloc(size_t size,
+                          unsigned int flags = cudaHostAllocDefault) {
+  void* ptr = nullptr;
+  check_cuda_error(cudaHostAlloc(&ptr, size, flags));
+  return reinterpret_cast<uintptr_t>(ptr);
+}
+
+// Python-facing wrapper for cudaFreeHost; accepts the integer returned
+// by cuda_host_alloc.
+void cuda_host_free(uintptr_t ptr) {
+  check_cuda_error(cudaFreeHost(reinterpret_cast<void*>(ptr)));
+}
+
+PYBIND11_MODULE(fastdeploy_ops, m) {
+  /**
+   * alloc_cache_pinned.cc
+   * cuda_host_alloc
+   * cuda_host_free
+   */
+  m.def("cuda_host_alloc",
+        &cuda_host_alloc,
+        "Allocate pinned memory",
+        py::arg("size"),
+        py::arg("flags") = cudaHostAllocDefault);
+  m.def(
+      "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
+  py::register_exception<CudaError>(m, "CudaError");
+}
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 2d01dcdb35..765de06030 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -637,12 +637,17 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
         "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
         "gpu_ops/sample_kernels/min_p_sampling_from_probs.cu",
+        "gpu_ops/get_data_ptr_ipc.cu",
+        "gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu",
+        "gpu_ops/unset_data_ipc.cu",
+        "gpu_ops/swap_cache_batch.cu",
"metax_ops/moe_dispatch.cu", "metax_ops/moe_ffn.cu", "metax_ops/moe_reduce.cu", "metax_ops/fused_moe.cu", "metax_ops/apply_rope_qkv.cu", "metax_ops/cache_kv_with_rope.cu", + "metax_ops/cpp_extensions.cc", ] sources += find_end_files("gpu_ops/speculate_decoding", ".cu") diff --git a/fastdeploy/cache_manager/ops.py b/fastdeploy/cache_manager/ops.py index f78f5431e3..370188d217 100644 --- a/fastdeploy/cache_manager/ops.py +++ b/fastdeploy/cache_manager/ops.py @@ -14,6 +14,8 @@ # limitations under the License. """ +import os + import paddle from fastdeploy.platforms import current_platform @@ -39,6 +41,29 @@ try: def get_peer_mem_addr(*args, **kwargs): raise RuntimeError("CUDA no need of get_peer_mem_addr!") + elif current_platform.is_maca(): + from fastdeploy.model_executor.ops.gpu import ( # get_output_kv_signal,; ipc_sent_key_value_cache_by_remote_ptr_block_sync, + cuda_host_alloc, + cuda_host_free, + get_data_ptr_ipc, + ipc_sent_key_value_cache_by_remote_ptr, + set_data_ipc, + share_external_data, + swap_cache_all_layers, + unset_data_ipc, + ) + + memory_allocated = paddle.device.memory_allocated + + def get_peer_mem_addr(*args, **kwargs): + raise RuntimeError("CUDA no need of get_peer_mem_addr!") + + def get_output_kv_signal(*args, **kwargs): + raise RuntimeError("Metax get_output_kv_signal UNIMPLENENTED!") + + def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs): + raise RuntimeError("Metax ipc_sent_key_value_cache_by_remote_ptr_block_sync UNIMPLENENTED!") + elif current_platform.is_xpu(): from fastdeploy.model_executor.ops.xpu import ( cuda_host_alloc, @@ -69,6 +94,8 @@ try: def set_device(device): if current_platform.is_cuda(): paddle.set_device(f"gpu:{device}") + elif current_platform.is_maca(): + paddle.set_device(f"metax_gpu:{device}") elif current_platform.is_xpu(): paddle.set_device(f"xpu:{device}") else: @@ -77,6 +104,8 @@ try: def share_external_data_(cache, cache_name, cache_shape, use_ipc): if current_platform.is_cuda(): cache = 
share_external_data(cache, cache_name, cache_shape) + elif current_platform.is_maca(): + cache = share_external_data(cache, cache_name, cache_shape) elif current_platform.is_xpu(): cache = share_external_data(cache, cache_name, cache_shape, use_ipc) else: @@ -86,6 +115,8 @@ try: def get_all_visible_devices(): if current_platform.is_xpu(): return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" + elif current_platform.is_maca(): + return f'MACA_VISIBLE_DEVICES={os.environ.get("MACA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7")}' else: return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 44520220ad..b7d7f4d807 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -531,7 +531,12 @@ class EngineArgs: self.tokenizer = self.model if self.splitwise_role == "decode": self.enable_prefix_caching = False - if not current_platform.is_cuda() and not current_platform.is_xpu() and not current_platform.is_intel_hpu(): + if ( + not current_platform.is_cuda() + and not current_platform.is_xpu() + and not current_platform.is_intel_hpu() + and not current_platform.is_maca() + ): self.enable_prefix_caching = False if self.enable_logprob: if not current_platform.is_cuda() and not current_platform.is_xpu():