Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-25 09:57:51 +08:00
[feat] support fa3 backend for pd disaggregated (#2695)
Deploy GitHub Pages / deploy (push) Has been cancelled
* support fa3 backend run in pd disaggregated
* delete use_fast_ffn
@@ -17,10 +17,16 @@
 from .append_attention import append_attention
 from .get_block_shape_and_split_kv_block import \
     get_block_shape_and_split_kv_block
+from .gqa_rope_write_cache import gqa_rope_write_cache
 from .init_signal_layerwise import init_signal_layerwise
 from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal
+from .pre_cache_len_concat import pre_cache_len_concat
 
 __all__ = [
-    "get_block_shape_and_split_kv_block", "append_attention",
-    "open_shm_and_get_meta_signal", "init_signal_layerwise"
+    "get_block_shape_and_split_kv_block",
+    "append_attention",
+    "open_shm_and_get_meta_signal",
+    "init_signal_layerwise",
+    "gqa_rope_write_cache",
+    "pre_cache_len_concat",
 ]
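The two added imports come from the new wrapper modules introduced in the hunks below (gqa_rope_write_cache.py and pre_cache_len_concat.py, named by the relative imports above), and both symbols are re-exported through __all__. A minimal sketch of the resulting import surface; the absolute package path is an assumption, since the diff only shows relative imports inside the package __init__:

# Package path below is assumed for illustration; the hunk above only
# shows relative imports inside the attention ops package __init__.
from fastdeploy.model_executor.layers.attention.ops import (
    gqa_rope_write_cache, pre_cache_len_concat)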
@@ -0,0 +1,66 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from typing import Optional
+
+import paddle
+
+from fastdeploy.platforms import current_platform
+
+
+def gqa_rope_write_cache(
+        qkv: paddle.Tensor,
+        key_cache: paddle.Tensor,
+        value_cache: paddle.Tensor,
+        cu_seqlens_q: paddle.Tensor,
+        cu_seqlens_k: paddle.Tensor,
+        rotary_embs: paddle.Tensor,
+        seq_lens_this_time: paddle.Tensor,
+        seq_lens_encoder: paddle.Tensor,
+        seq_lens_decoder: paddle.Tensor,
+        padding_offsets: paddle.Tensor,
+        cum_offsets: paddle.Tensor,
+        block_tables: paddle.Tensor,
+        kv_batch_ids: paddle.Tensor,
+        kv_tile_ids_per_batch: paddle.Tensor,
+        kv_num_blocks: paddle.Tensor,
+        cache_batch_ids: paddle.Tensor,
+        cache_tile_ids_per_batch: paddle.Tensor,
+        cache_num_blocks: paddle.Tensor,
+        cache_k_quant_scales: Optional[paddle.Tensor] = None,
+        cache_v_quant_scales: Optional[paddle.Tensor] = None,
+        cache_k_dequant_scales: Optional[paddle.Tensor] = None,
+        cache_v_dequant_scales: Optional[paddle.Tensor] = None,
+        cache_k_zp: Optional[paddle.Tensor] = None,
+        cache_v_zp: Optional[paddle.Tensor] = None,
+        kv_signal_data: Optional[paddle.Tensor] = None,
+        kv_token_num: int = 1,
+        max_seq_len: int = 0,
+        cache_quant_type: str = "none"):
+    if current_platform.is_cuda():
+        from fastdeploy.model_executor.ops.gpu import gqa_rope_write_cache
+        q, k, v, qkv_ = gqa_rope_write_cache(
+            qkv, key_cache, value_cache, cu_seqlens_q, cu_seqlens_k,
+            rotary_embs, seq_lens_this_time, seq_lens_encoder,
+            seq_lens_decoder, padding_offsets, cum_offsets, block_tables,
+            kv_batch_ids, kv_tile_ids_per_batch, kv_num_blocks,
+            cache_batch_ids, cache_tile_ids_per_batch, cache_num_blocks,
+            cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales,
+            cache_v_dequant_scales, cache_k_zp, cache_v_zp, kv_signal_data,
+            kv_token_num, max_seq_len, cache_quant_type)
+        return q, k, v, qkv_
+    else:
+        raise NotImplementedError()
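The Python-side gqa_rope_write_cache is a thin dispatcher: on CUDA it lazily imports the compiled op of the same name, which (as the name suggests) applies rotary embeddings to the fused GQA qkv activations and writes K/V into the paged cache, returning the unpacked q, k, v plus the transformed qkv buffer; on any other platform it raises NotImplementedError. The sketch below only shows how the 18 required arguments group into activations, cache, varlen offsets, and tiling metadata. Every shape, dtype, and value is an illustrative assumption, not a documented contract; real callers receive these tensors from the forward metadata and get_block_shape_and_split_kv_block:

import paddle

# Package path assumed, as noted after the first hunk.
from fastdeploy.model_executor.layers.attention.ops import gqa_rope_write_cache

# Assumed sizes: one prefill request of 8 tokens, 8 query heads sharing
# 4 KV heads (GQA), head_dim 128, a paged cache of four 64-token blocks.
T, HQ, HKV, D, BLOCK, NBLK = 8, 8, 4, 128, 64, 4

def i32(*shape):
    # Placeholder int32 metadata; real values come from the scheduler.
    return paddle.zeros(list(shape), dtype="int32")

q, k, v, qkv_out = gqa_rope_write_cache(
    # Fused activations and the paged KV cache to write into; the cache
    # layout [num_blocks, kv_heads, block_size, head_dim] is a guess.
    qkv=paddle.randn([T, (HQ + 2 * HKV) * D]).astype("bfloat16"),
    key_cache=paddle.zeros([NBLK, HKV, BLOCK, D], dtype="bfloat16"),
    value_cache=paddle.zeros([NBLK, HKV, BLOCK, D], dtype="bfloat16"),
    # Varlen layout: cumulative q/k lengths over the batch.
    cu_seqlens_q=paddle.to_tensor([0, T], dtype="int32"),
    cu_seqlens_k=paddle.to_tensor([0, T], dtype="int32"),
    # The rotary table shape is likewise a guess for illustration.
    rotary_embs=paddle.randn([2, 1, 2048, 1, D // 2]),
    # Per-request scheduling state: pure prefill, so decoder lens are 0.
    seq_lens_this_time=paddle.to_tensor([T], dtype="int32"),
    seq_lens_encoder=paddle.to_tensor([T], dtype="int32"),
    seq_lens_decoder=i32(1),
    padding_offsets=i32(T),
    cum_offsets=i32(1),
    block_tables=paddle.arange(NBLK, dtype="int32").reshape([1, NBLK]),
    # Tiling metadata normally produced by get_block_shape_and_split_kv_block.
    kv_batch_ids=i32(NBLK),
    kv_tile_ids_per_batch=i32(NBLK),
    kv_num_blocks=i32(1),
    cache_batch_ids=i32(NBLK),
    cache_tile_ids_per_batch=i32(NBLK),
    cache_num_blocks=i32(1),
    # Quant scales and zero points stay None for cache_quant_type="none".
    kv_token_num=T,
    max_seq_len=2048,
    cache_quant_type="none")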
@@ -0,0 +1,36 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import paddle
+
+from fastdeploy.platforms import current_platform
+
+
+def pre_cache_len_concat(seq_lens_decoder: paddle.Tensor,
+                         seq_lens_this_time: paddle.Tensor,
+                         max_dec_len: int = 0,
+                         block_size: int = 64):
+    if current_platform.is_cuda():
+        from fastdeploy.model_executor.ops.gpu import pre_cache_len_concat
+        out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time,
+                                   max_dec_len, block_size)
+        return out
+    else:
+        raise NotImplementedError()
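pre_cache_len_concat follows the same platform-dispatch pattern with a much smaller surface: given the per-request lengths already held in cache (seq_lens_decoder) and the lengths scheduled this step (seq_lens_this_time), the compiled op builds the concatenated prefix-length metadata that, going by this PR, the FA3 attention path consumes. A hedged sketch of a call; the numbers are made up, and the exact structure of out is defined by the compiled op rather than by this wrapper:

import paddle

# Package path assumed, as above.
from fastdeploy.model_executor.layers.attention.ops import pre_cache_len_concat

# Two in-flight requests with 130 and 7 tokens already cached, each
# decoding one new token this step; the values are illustrative only.
seq_lens_decoder = paddle.to_tensor([130, 7], dtype="int32")
seq_lens_this_time = paddle.to_tensor([1, 1], dtype="int32")

out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time,
                           max_dec_len=2048, block_size=64)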