mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Feature] Support KV Cache Storage (#5571)
* Support Mooncake Store * up * up * add op * fix conflict * fix error * up for comments * avoid thread lock * up * fix unittest * fix unittest * remove debug info * consider tp_size > 1 * add default rdma_nics * add utils * up * fix error --------- Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "helper.h"
|
||||
#include "paddle/extension.h"
|
||||
|
||||
// #define SWAP_DEBUG
|
||||
|
||||
// Copies KV-cache blocks between per-layer GPU cache tensors and a single
// flat CPU buffer, asynchronously on the stream of the first GPU tensor,
// then synchronizes once at the end so the data is visible on return.
//
// mode == 0: GPU -> CPU; any other value: CPU -> GPU.
// GPU cache layout:  layer_num * [block_num, head_num, block_size, head_dim]
// CPU buffer layout: [block_num, layer_num, head_num, block_size, head_dim]
//
// cache_cpu_pointer is a raw host address smuggled through an int64_t.
// NOTE(review): for the copies to actually overlap, this host buffer should
// be pinned (cudaMallocHost/cudaHostAlloc) — not verifiable from here.
template <paddle::DataType D>
void SwapCacheImpLayout(
    const std::vector<paddle::Tensor>& cache_gpu_tensors,  // gpu
    const int64_t& cache_cpu_pointer,                      // cpu
    const std::vector<int64_t>& cache_shape,
    const std::vector<int64_t>& gpu_block_ids,
    const std::vector<int64_t>& cpu_block_ids,
    int mode) {
  typedef PDTraits<D> traits_;
  typedef typename traits_::DataType DataType_;
  typedef typename traits_::data_t data_t;

  // The two id lists are consumed in lockstep below; a length mismatch
  // would read past the end of cpu_block_ids.
  if (gpu_block_ids.size() != cpu_block_ids.size()) {
    PD_THROW("swap_cache_layout: gpu_block_ids and cpu_block_ids ",
             "must have the same length.");
  }

  const int64_t layer_number = cache_gpu_tensors.size();
  const int64_t num_heads = cache_shape[1];
  const int64_t block_size = cache_shape[2];
  const int64_t head_dim = cache_shape[3];
  // Elements in one block of one layer.
  const int64_t cache_block_stride = num_heads * block_size * head_dim;

#ifdef SWAP_DEBUG
  std::cout << "layer_number:" << layer_number << std::endl;
  std::cout << "cache_shape:" << cache_shape[0] << ", " << cache_shape[1]
            << ", " << cache_shape[2] << ", " << cache_shape[3] << std::endl;
  std::cout << "cache_block_stride:" << cache_block_stride << std::endl;
#endif

  // All layers are assumed to live on the same device/stream as tensor 0.
  auto stream = cache_gpu_tensors[0].stream();
  const cudaMemcpyKind copy_kind =
      (mode == 0) ? cudaMemcpyDeviceToHost : cudaMemcpyHostToDevice;

  for (size_t layer_idx = 0; layer_idx < cache_gpu_tensors.size();
       layer_idx++) {
    const paddle::Tensor& cache_gpu = cache_gpu_tensors[layer_idx];
    data_t* cache_gpu_ptr = const_cast<data_t*>(cache_gpu.data<data_t>());
    auto* cache_cpu_ptr = reinterpret_cast<data_t*>(cache_cpu_pointer);

    for (size_t block_idx = 0; block_idx < gpu_block_ids.size();
         block_idx++) {
      const int64_t cur_gpu_block_id = gpu_block_ids[block_idx];
      const int64_t cur_cpu_block_id = cpu_block_ids[block_idx];
      // GPU side: per-layer tensor, indexed by block only.
      auto* cache_gpu_ptr_now =
          cache_gpu_ptr + cur_gpu_block_id * cache_block_stride;
      // CPU side: blocks are outermost, layers packed inside each block.
      auto* cache_cpu_ptr_now =
          cache_cpu_ptr + cur_cpu_block_id * cache_block_stride * layer_number +
          static_cast<int64_t>(layer_idx) * cache_block_stride;

      cudaError_t status = cudaMemcpyAsync(
          (copy_kind == cudaMemcpyDeviceToHost) ? cache_cpu_ptr_now
                                                : cache_gpu_ptr_now,
          (copy_kind == cudaMemcpyDeviceToHost) ? cache_gpu_ptr_now
                                                : cache_cpu_ptr_now,
          cache_block_stride * sizeof(DataType_),
          copy_kind,
          stream);
      // Previously the status was assigned but never inspected; a failed
      // enqueue (bad pointer, wrong device, ...) would be silently dropped.
      if (status != cudaSuccess) {
        PD_THROW("swap_cache_layout: cudaMemcpyAsync failed: ",
                 cudaGetErrorString(status));
      }

#ifdef SWAP_DEBUG
      cudaStreamSynchronize(stream);
      std::cout << "mode:" << mode << ", layer_idx:" << layer_idx
                << ", block_idx:" << block_idx << ", cache_cpu_ptr_now data:"
                << static_cast<float>(*cache_cpu_ptr_now) << std::endl;
#endif
    }
  }
  // Single synchronization after all copies are enqueued; also surfaces any
  // asynchronous execution error from the copies above.
  cudaError_t sync_status = cudaStreamSynchronize(stream);
  if (sync_status != cudaSuccess) {
    PD_THROW("swap_cache_layout: cudaStreamSynchronize failed: ",
             cudaGetErrorString(sync_status));
  }
}
|
||||
|
||||
// Host-side entry point for the swap_cache_layout custom op: binds the
// calling process to its device and dispatches to SwapCacheImpLayout on the
// dtype of the first GPU cache tensor (BF16 / FP16 / UINT8).
// mode == 0 copies GPU -> CPU; any other value copies CPU -> GPU.
void SwapCacheLayout(
    const std::vector<paddle::Tensor>& cache_gpu_tensors,  // gpu
    const int64_t& cache_cpu_ptrs,  // cpu memory pointer
    const std::vector<int64_t>& cache_shape,
    const std::vector<int64_t>& gpu_block_ids,
    const std::vector<int64_t>& cpu_block_ids,
    int rank,
    int mode) {
  // Select this rank's device; used for distributed launch. The return
  // value was previously ignored — an invalid rank would make every later
  // CUDA call fail with a confusing error.
  cudaError_t status = cudaSetDevice(rank);
  if (status != cudaSuccess) {
    PD_THROW("swap_cache_layout: cudaSetDevice failed: ",
             cudaGetErrorString(status));
  }
  // PD_CHECK instead of assert(): assert() compiles away under NDEBUG,
  // which would let cache_gpu_tensors[0] below read out of bounds.
  PD_CHECK(!cache_gpu_tensors.empty(),
           "swap_cache_layout: cache_gpu_tensors must not be empty.");
  switch (cache_gpu_tensors[0].dtype()) {
    case paddle::DataType::BFLOAT16:
      return SwapCacheImpLayout<paddle::DataType::BFLOAT16>(cache_gpu_tensors,
                                                            cache_cpu_ptrs,
                                                            cache_shape,
                                                            gpu_block_ids,
                                                            cpu_block_ids,
                                                            mode);
    case paddle::DataType::FLOAT16:
      return SwapCacheImpLayout<paddle::DataType::FLOAT16>(cache_gpu_tensors,
                                                           cache_cpu_ptrs,
                                                           cache_shape,
                                                           gpu_block_ids,
                                                           cpu_block_ids,
                                                           mode);
    case paddle::DataType::UINT8:
      return SwapCacheImpLayout<paddle::DataType::UINT8>(cache_gpu_tensors,
                                                         cache_cpu_ptrs,
                                                         cache_shape,
                                                         gpu_block_ids,
                                                         cpu_block_ids,
                                                         mode);
    default:
      PD_THROW("Unsupported data type.");
  }
}
|
||||
|
||||
// Register swap_cache_layout as a static Paddle custom op.
// Inputs: the vector of per-layer GPU cache tensors.
// Attrs mirror SwapCacheLayout's remaining parameters, in order:
//   cache_cpu_ptrs  - host buffer base address passed as an int64
//   cache_shape     - [block_num, head_num, block_size, head_dim]
//   gpu_block_ids   - GPU-side block indices to copy
//   cpu_block_ids   - matching CPU-side block indices
//   rank            - device ordinal for cudaSetDevice
//   mode            - 0: GPU -> CPU, otherwise CPU -> GPU
PD_BUILD_STATIC_OP(swap_cache_layout)
    .Inputs({paddle::Vec("cache_gpu_tensors")})
    .Attrs({
        "cache_cpu_ptrs: int64_t",
        "cache_shape: std::vector<int64_t>",
        "gpu_block_ids: std::vector<int64_t>",
        "cpu_block_ids: std::vector<int64_t>",
        "rank: int",
        "mode: int",
    })
    .SetKernelFn(PD_KERNEL(SwapCacheLayout));
|
||||
Reference in New Issue
Block a user