mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 01:29:57 +08:00
[Feature][MTP] Support cacheKV transfer in per_chunk mode (#2890)
* support chunk_prefill both normal and speculative_decoding(mtp) * optimize pd-disaggregation config * fix bug
This commit is contained in:
@@ -21,6 +21,7 @@ from .gqa_rope_write_cache import gqa_rope_write_cache
|
||||
from .init_signal_layerwise import init_signal_layerwise
|
||||
from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal
|
||||
from .pre_cache_len_concat import pre_cache_len_concat
|
||||
from .init_kv_signal_per_query import init_kv_signal_per_query
|
||||
|
||||
__all__ = [
|
||||
"get_block_shape_and_split_kv_block",
|
||||
@@ -29,4 +30,5 @@ __all__ = [
|
||||
"init_signal_layerwise",
|
||||
"gqa_rope_write_cache",
|
||||
"pre_cache_len_concat",
|
||||
"init_kv_signal_per_query"
|
||||
]
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
|
||||
def init_kv_signal_per_query(
    seq_lens_encoder: paddle.Tensor,
    seq_lens_this_time: paddle.Tensor,
    seq_lens_decoder: paddle.Tensor,
    rank: int,
    num_layers: int,
) -> paddle.Tensor:
    """
    init_kv_signal_per_query

    Dispatches to the CUDA custom op of the same name when running on a
    CUDA platform; other platforms are not supported.

    Args:
        seq_lens_encoder: per-request encoder sequence lengths.
        seq_lens_this_time: per-request sequence lengths for this step.
        seq_lens_decoder: per-request decoder sequence lengths.
        rank: rank id passed through to the custom op.
        num_layers: number of layers passed through to the custom op.

    Returns:
        Whatever the GPU custom op returns (a paddle.Tensor).

    Raises:
        NotImplementedError: when the current platform is not CUDA.
    """
    # Guard clause: only the CUDA build ships this custom op.
    if not current_platform.is_cuda():
        raise NotImplementedError()

    # Imported lazily so non-CUDA installs never touch the GPU op module.
    from fastdeploy.model_executor.ops.gpu import (
        init_kv_signal_per_query as _gpu_init_kv_signal_per_query,
    )

    return _gpu_init_kv_signal_per_query(
        seq_lens_encoder,
        seq_lens_this_time,
        seq_lens_decoder,
        rank,
        num_layers,
    )
|
||||
Reference in New Issue
Block a user