[feat] support fa3 backend for pd disaggregated (#2695)
Deploy GitHub Pages / deploy (push) Has been cancelled

* support fa3 backend run in pd disaggregated

* support fa3 backend run in pd disaggregated

* support fa3 backend run in pd disaggregated

* support fa3 backend run in pd disaggregated

* delete use_fast_ffn
This commit is contained in:
Yuanle Liu
2025-07-03 22:33:27 +08:00
committed by GitHub
parent 00863c43fd
commit 240bdac2a4
26 changed files with 455 additions and 139 deletions
@@ -17,10 +17,16 @@
from .append_attention import append_attention
from .get_block_shape_and_split_kv_block import \
get_block_shape_and_split_kv_block
from .gqa_rope_write_cache import gqa_rope_write_cache
from .init_signal_layerwise import init_signal_layerwise
from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal
from .pre_cache_len_concat import pre_cache_len_concat
__all__ = [
"get_block_shape_and_split_kv_block", "append_attention",
"open_shm_and_get_meta_signal", "init_signal_layerwise"
"get_block_shape_and_split_kv_block",
"append_attention",
"open_shm_and_get_meta_signal",
"init_signal_layerwise",
"gqa_rope_write_cache",
"pre_cache_len_concat",
]
@@ -0,0 +1,66 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
from fastdeploy.platforms import current_platform
def gqa_rope_write_cache(
qkv: paddle.Tensor,
key_cache: paddle.Tensor,
value_cache: paddle.Tensor,
cu_seqlens_q: paddle.Tensor,
cu_seqlens_k: paddle.Tensor,
rotary_embs: paddle.Tensor,
seq_lens_this_time: paddle.Tensor,
seq_lens_encoder: paddle.Tensor,
seq_lens_decoder: paddle.Tensor,
padding_offsets: paddle.Tensor,
cum_offsets: paddle.Tensor,
block_tables: paddle.Tensor,
kv_batch_ids: paddle.Tensor,
kv_tile_ids_per_batch: paddle.Tensor,
kv_num_blocks: paddle.Tensor,
cache_batch_ids: paddle.Tensor,
cache_tile_ids_per_batch: paddle.Tensor,
cache_num_blocks: paddle.Tensor,
cache_k_quant_scales: Optional[paddle.Tensor] = None,
cache_v_quant_scales: Optional[paddle.Tensor] = None,
cache_k_dequant_scales: Optional[paddle.Tensor] = None,
cache_v_dequant_scales: Optional[paddle.Tensor] = None,
cache_k_zp: Optional[paddle.Tensor] = None,
cache_v_zp: Optional[paddle.Tensor] = None,
kv_signal_data: Optional[paddle.Tensor] = None,
kv_token_num: int = 1,
max_seq_len: int = 0,
cache_quant_type: str = "none"):
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import gqa_rope_write_cache
q, k, v, qkv_ = gqa_rope_write_cache(
qkv, key_cache, value_cache, cu_seqlens_q, cu_seqlens_k,
rotary_embs, seq_lens_this_time, seq_lens_encoder,
seq_lens_decoder, padding_offsets, cum_offsets, block_tables,
kv_batch_ids, kv_tile_ids_per_batch, kv_num_blocks,
cache_batch_ids, cache_tile_ids_per_batch, cache_num_blocks,
cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales,
cache_v_dequant_scales, cache_k_zp, cache_v_zp, kv_signal_data,
kv_token_num, max_seq_len, cache_quant_type)
return q, k, v, qkv_
else:
raise NotImplementedError()
@@ -0,0 +1,36 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License,
Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from fastdeploy.platforms import current_platform
def pre_cache_len_concat(seq_lens_decoder: paddle.Tensor,
seq_lens_this_time: paddle.Tensor,
max_dec_len: int = 0,
block_size: int = 64):
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import pre_cache_len_concat
out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time,
max_dec_len, block_size)
return out
else:
raise NotImplementedError()