Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00
@@ -42,13 +42,10 @@ struct softmax_state_t {
    }
  }

  __device__ __forceinline__ softmax_state_t() { init(); }

  __device__ __forceinline__ void merge(
      const AlignedVector<T, vec_size>& other_o, T other_m, T other_d) {
    // using kType = typename cascade_attn_nv_type2_traits<T>::type;
    T m_prev = m, d_prev = d;
    m = m_prev > other_m ? m_prev : other_m;
@@ -63,13 +60,11 @@ struct softmax_state_t {
  }

  __device__ __forceinline__ void normalize() {
#pragma unroll
    for (size_t i = 0; i < vec_size; ++i) {
      o[i] /= d;
    }
  }
};

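// Note on the two accumulator structs above and below (descriptive comment,
// not part of the original diff): softmax_state_t keeps a per-thread
// online-softmax state, with o as the partially accumulated output vector,
// m as the running max logit and d as the running denominator. merge() folds
// another partial state in by rescaling both sides to the new maximum, roughly
//   m_new = max(m, other_m)
//   d_new = d * exp(m - m_new) + other_d * exp(other_m - m_new)
//   o_new = o * exp(m - m_new) + other_o * exp(other_m - m_new)
// and normalize() divides o by d once every chunk has been merged.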
template <size_t vec_size, typename T, uint32_t num_tiles = 0>
@@ -102,65 +97,79 @@ struct softmax_state_ts {
    }
  }

  __device__ __forceinline__ softmax_state_ts() { init(); }

  __device__ __forceinline__ void normalize(const uint32_t tile_id) {
#pragma unroll
    for (size_t i = 0; i < vec_size; i++) {
      o[tile_id][i] /= d;
    }
  }
};

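// produce_kv (below) appears to stage one tile of the paged KV cache into
// shared memory: the logical sequence position is mapped to a physical block
// through block_table_smem, and each thread issues predicated 128-bit loads so
// positions past chunk_end are masked out rather than read.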
template <SharedMemFillMode fill_mode,
          uint32_t HEAD_DIM_QK,
          uint32_t vec_size,
          uint32_t NUM_VEC_PER_HEAD,
          uint32_t bdx,
          uint32_t BLOCK_SIZE,
          uint32_t CACHE_VEC_SIZE,
          typename CacheT>
__device__ __forceinline__ void produce_kv(CacheT* smem,
                                           CacheT* kv_base_gptr,
                                           const int* block_table_smem,
                                           const uint32_t seq_offset_gmem,
                                           const uint32_t seq_offset_smem,
                                           const uint32_t kv_head_idx,
                                           const uint32_t kv_num_heads,
                                           const uint32_t tidx,
                                           const uint32_t chunk_start,
                                           const uint32_t chunk_end) {
  int block_id = __ldg(&block_table_smem[seq_offset_gmem / BLOCK_SIZE]);
  if (block_id < 0) {
    block_id = 0;
  }
  const uint32_t block_offset = seq_offset_gmem % BLOCK_SIZE;
  // 8/16 T/int8 each time
  const uint32_t k_offset_base =
      ((block_id * kv_num_heads + kv_head_idx) * BLOCK_SIZE + block_offset) *
      HEAD_DIM_QK;
  const uint32_t smem_offset_base = seq_offset_smem * HEAD_DIM_QK;
  for (uint32_t vid = tidx; vid < NUM_VEC_PER_HEAD; vid += bdx) {
    pred_load<128, PrefetchMode::kPrefetch, fill_mode, CacheT>(
        smem + smem_offset_base + vid * CACHE_VEC_SIZE,
        kv_base_gptr + k_offset_base + vid * CACHE_VEC_SIZE,
        seq_offset_gmem < chunk_end);
  }
}

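// compute_qk (below) presumably computes the scaled dot products between the
// query staged in shared memory and the DEAL_EACH_TIME keys of the current
// stage, accumulating the per-key scores into s[] and updating the online
// softmax state st (its m and d) as it goes.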
template <uint32_t vec_size,
          uint32_t NUM_VEC_PER_HEAD,
          uint32_t bdx,
          uint32_t bdy,
          uint32_t HEAD_DIM,
          uint32_t DEAL_EACH_TIME,
          uint32_t num_tile_v,
          typename T,
          typename CacheT>
__device__ __forceinline__ void compute_qk(
    const T* cu_q_smem,
    const CacheT* k_smem,
    const uint32_t kv_idx_base,
    const uint32_t stage_idx,
    const uint32_t iter_base,
    const uint32_t iter_bound,
    const uint32_t tidx,
    const uint32_t gid,
    const float scale,
    float* s,
    softmax_state_ts<vec_size, T, num_tile_v>& st) {
  const CacheT* smem;
  AlignedVector<T, vec_size> q_vec;
  AlignedVector<T, vec_size> k_vec;
  float m_prev = st.m;
  // smem = base_smem + (stage_idx * DEAL_EACH_TIME + zid * tile_size) *
  // HEAD_DIM;
  smem = k_smem + stage_idx * DEAL_EACH_TIME * HEAD_DIM;
#pragma unroll
  for (uint32_t j = 0; j < DEAL_EACH_TIME; ++j) {
@@ -171,7 +180,7 @@ __device__ __forceinline__ void compute_qk(const T* cu_q_smem,
      s[j] = 0.f;
    }
#pragma unroll
    for (uint32_t vid = tidx; vid < NUM_VEC_PER_HEAD; vid += bdx) {
      Load<T, vec_size>(cu_q_smem + vid * vec_size, &q_vec);
      Load<CacheT, vec_size>(smem + j * HEAD_DIM + vid * vec_size, &k_vec);
      for (uint32_t i = 0; i < vec_size; ++i) {
@@ -211,20 +220,29 @@ __device__ __forceinline__ void compute_qk(const T* cu_q_smem,
  }
}

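// compute_sv (below) then weights the staged V rows by the scores in s[] and
// accumulates them into st.o, one head-dim tile per thread stride.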
template <uint32_t vec_size,
          uint32_t NUM_VEC_PER_HEAD,
          uint32_t bdx,
          uint32_t DEAL_EACH_TIME,
          uint32_t HEAD_DIM_QK,
          uint32_t num_tile,
          typename T,
          typename CacheT>
__device__ __forceinline__ void compute_sv(
    const float* s,
    const CacheT* base_v_smem,
    const uint32_t stage_idx,
    const uint32_t iter_base,
    const uint32_t iter_bound,
    const uint32_t tidx,
    softmax_state_ts<vec_size, T, num_tile>& st) {
  const CacheT* v_smem;
  AlignedVector<T, vec_size> v_vec;
#pragma unroll
  for (int j = 0; (j < DEAL_EACH_TIME) && (iter_base + j < iter_bound); ++j) {
    v_smem = base_v_smem + stage_idx * DEAL_EACH_TIME * HEAD_DIM_QK +
             j * HEAD_DIM_QK;
    for (uint32_t vid = tidx; vid < NUM_VEC_PER_HEAD; vid += bdx) {
      Load<T, vec_size>(v_smem + vid * vec_size, &v_vec);
      uint32_t tile_id = vid / bdx;
#pragma unroll

@@ -41,4 +41,5 @@ void DecoderWriteCacheWithRoPEKernel(
    paddle::Tensor* key_cache_out,
    paddle::Tensor* value_cache_out,
    const paddle::optional<paddle::Tensor>& q_norm_weight,
    const paddle::optional<paddle::Tensor>& k_norm_weight,
    const float rms_norm_eps);

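// The encoder-side hunks below pick one of three fused rotary-embedding paths
// before the KV cache is written: gqa_rotary_qk_norm_variable when
// q_norm_weight/k_norm_weight are given (GQA only), gqa_rotary_qk_variable for
// plain GQA, and gqa_rotary_qk_quant_variable when channel-wise cache scales
// are present; partial rotary (rotary_dim < head_dim) is rejected outside the
// supported combination.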
@@ -56,46 +56,53 @@ void EncoderWriteCacheWithRopeKernel(
  auto head_dim = meta_data.head_dims;
  bool is_scale_channel_wise = false;
  int rotary_dim = head_dim;
  if (cache_k_scale &&
      cache_k_scale.get().dims()[0] == head_dim * kv_num_heads) {
    is_scale_channel_wise = true;
  }
  if (rotary_embs) {
    rotary_dim =
        rotary_embs.get().dims()[rotary_embs.get().dims().size() - 1] * 2;
    if (rotary_dim < head_dim) {
      if (!use_neox_style || q_norm_weight || k_norm_weight ||
          num_heads == kv_num_heads || is_scale_channel_wise) {
        PADDLE_THROW(phi::errors::Fatal(
            "partial_rotary_factor < 1.0 only supports "
            "use_neox_rotary_style=True, q_norm_weight/k_norm_weight) is None, "
            "GQA and is_scale_channel_wise=false."));
      }
    }
  }

  if (q_norm_weight && k_norm_weight) {
    if (num_heads != kv_num_heads && !is_scale_channel_wise &&
        !use_neox_style) {
      gqa_rotary_qk_norm_variable(
          qkv_out->data<T>(),
          qkv.data<QKV_TYPE>(),
          qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
          qkv_biases ? qkv_biases.get().data<T>() : nullptr,
          rotary_embs.get().data<float>(),
          batch_id_per_token.data<int>(),
          cu_seqlens_q.data<int>(),
          seq_lens_encoder.data<int>(),
          seq_lens_decoder.data<int>(),
          token_num,
          num_heads,
          kv_num_heads,
          max_seq_len,
          rope_3d ? rotary_embs.get().dims()[3] : rotary_embs.get().dims()[2],
          head_dim,
          stream,
          use_neox_style,
          rope_3d,
          q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
          k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
          rms_norm_eps);
    } else {
      PD_THROW(
          "gqa_rotary_qk_norm_variable only support gqa mode. channel wise "
          "scale and neox style are not supported");
    }
  } else {
    if (num_heads == kv_num_heads) {
@@ -120,49 +127,48 @@ void EncoderWriteCacheWithRopeKernel(
    } else {
      if (!is_scale_channel_wise) {
        gqa_rotary_qk_variable(
            qkv_out->data<T>(),
            qkv.data<QKV_TYPE>(),
            qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
            qkv_biases ? qkv_biases.get().data<T>() : nullptr,
            rotary_embs.get().data<float>(),
            batch_id_per_token.data<int>(),
            cu_seqlens_q.data<int>(),
            seq_lens_encoder.data<int>(),
            seq_lens_decoder.data<int>(),
            token_num,
            num_heads,
            kv_num_heads,
            max_seq_len,
            rope_3d ? rotary_embs.get().dims()[3] : rotary_embs.get().dims()[2],
            head_dim,
            rotary_dim,
            stream,
            use_neox_style,
            rope_3d);
      } else {
        gqa_rotary_qk_quant_variable(
            qkv_out->data<T>(),
            qkv.data<QKV_TYPE>(),
            qkv_out_scales ? qkv_out_scales.get().data<float>() : nullptr,
            qkv_biases ? qkv_biases.get().data<T>() : nullptr,
            cache_k_scale ? cache_k_scale.get().data<T>() : nullptr,
            cache_v_scale ? cache_v_scale.get().data<T>() : nullptr,
            rotary_embs.get().data<float>(),
            batch_id_per_token.data<int>(),
            cu_seqlens_q.data<int>(),
            seq_lens_encoder.data<int>(),
            seq_lens_decoder.data<int>(),
            token_num,
            num_heads,
            kv_num_heads,
            max_seq_len,
            rotary_embs.get().dims()[2],
            head_dim,
            stream,
            use_neox_style,
            rope_3d);
      }

    }
  }
  const uint32_t block_size = meta_data.block_size;
@@ -178,7 +184,9 @@ EncoderWriteCacheWithRopeKernel(
          stream,
          key_cache_out,
          value_cache_out);
    } else if (cache_quant_type_str == "cache_int8" or
               cache_quant_type_str == "cache_fp8" or
               cache_quant_type_str == "block_wise_fp8") {
      DISPATCH_HEAD_DIM(
          head_dim, HEAD_DIM, {DISPATCH_BLOCK_SIZE(block_size, BLOCK_SIZE, {
            CascadeAppendWriteCacheKVC8QKV<T, HEAD_DIM, BLOCK_SIZE>(
@@ -234,23 +242,29 @@ void EncoderWriteCacheWithRopeKernel(
              "cache_int4_zp]");
  }

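// After the cache writes are enqueued, the block below presumably notifies the
// prefill/decode-disaggregation side: when FLAGS_fmt_write_cache_completed_signal
// is set, a host callback is queued on the same stream so the signal fires only
// once this layer's KV writes have finished; FLAGS_use_pd_disaggregation_per_chunk
// selects the per-query variant of that callback.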
  const char* fmt_write_cache_completed_signal_str =
      std::getenv("FLAGS_fmt_write_cache_completed_signal");
  const char* FLAGS_use_pd_disaggregation_per_chunk =
      std::getenv("FLAGS_use_pd_disaggregation_per_chunk");
  if (fmt_write_cache_completed_signal_str &&
      (std::strcmp(fmt_write_cache_completed_signal_str, "true") == 0 ||
       std::strcmp(fmt_write_cache_completed_signal_str, "1") == 0)) {
    if (FLAGS_use_pd_disaggregation_per_chunk &&
        (std::strcmp(FLAGS_use_pd_disaggregation_per_chunk, "true") == 0 ||
         std::strcmp(FLAGS_use_pd_disaggregation_per_chunk, "1") == 0)) {
      cudaLaunchHostFunc(
          qkv.stream(),
          &(RemoteCacheKvIpc::
                save_cache_kv_complete_signal_layerwise_per_query),
          (void*)nullptr);
    } else {
      if (kv_signal_data) {
        cudaLaunchHostFunc(
            qkv.stream(),
            &RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise,
            (void*)(const_cast<int64_t*>(
                kv_signal_data.get().data<int64_t>())));
      }
    }
  }
}

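// The following hunks touch the shared-memory load helpers. On the METAX custom
// device path the predicated loads are emulated with memset/memcpy through
// __cvta_shared_to_generic: kFillZero first zeroes the 16/8/4-byte destination
// and then copies src_in_bytes (0 when the predicate is false), while the
// non-fill path copies only when the predicate holds.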
@@ -66,10 +66,10 @@ __device__ __forceinline__ void load_128b(T* smem_ptr, const T* gmem_ptr) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
  if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
    memset(__cvta_shared_to_generic(smem_int_ptr), 0, 16);
    memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 16);
  } else {
    memset(__cvta_shared_to_generic(smem_int_ptr), 0, 16);
    memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 16);
  }
#else
  if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {

@@ -100,19 +100,23 @@ __device__ __forceinline__ void pred_load_128b(T* smem_ptr,
    int src_in_bytes = predicate ? 16 : 0;
    if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
      memset(__cvta_shared_to_generic(smem_int_ptr), 0, 16);
      memcpy(__cvta_shared_to_generic(smem_int_ptr),
             (void*)gmem_ptr,
             src_in_bytes);
    } else {
      memset(__cvta_shared_to_generic(smem_int_ptr), 0, 16);
      memcpy(__cvta_shared_to_generic(smem_int_ptr),
             (void*)gmem_ptr,
             src_in_bytes);
    }
  } else {
    if constexpr (prefetch_mode == PrefetchMode::kPrefetch) {
      if (predicate) {
        memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 16);
      }
    } else {
      if (predicate) {
        memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 16);
      }
    }
  }

@@ -169,10 +173,11 @@ __device__ __forceinline__ void pred_load_64b(T* smem_ptr,
  if constexpr (fill_mode == SharedMemFillMode::kFillZero) {
    int src_in_bytes = predicate ? 8 : 0;
    memset(__cvta_shared_to_generic(smem_int_ptr), 0, 8);
    memcpy(
        __cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, src_in_bytes);
  } else {
    if (predicate) {
      memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 8);
    }
  }
#else

@@ -207,10 +212,11 @@ __device__ __forceinline__ void pred_load_32b(T* smem_ptr,
  if constexpr (fill_mode == SharedMemFillMode::kFillZero) {
    int src_in_bytes = predicate ? 4 : 0;
    memset(__cvta_shared_to_generic(smem_int_ptr), 0, 4);
    memcpy(
        __cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, src_in_bytes);
  } else {
    if (predicate) {
      memcpy(__cvta_shared_to_generic(smem_int_ptr), (void*)gmem_ptr, 4);
    }
  }
#else

@@ -275,7 +281,6 @@ struct smem_t {
  template <typename T>
  __device__ __forceinline__ smem_t(T* base) : base((b128_t*)base) {}

  template <uint32_t stride, uint32_t inv_stride = 0>
  static __device__ __forceinline__ uint32_t get_permuted_offset(uint32_t i,
                                                                 uint32_t j) {

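// The *_absorb_cache_kernel family below writes the MLA latent KV into the
// paged cache. Each cache row is all_size wide, which appears to be
// nope_size + pe_size: the first branch stores the compressed (nope) part at
// offset h_bias and the second stores the rotary (pe) part at nope_size + h_bias.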
@@ -20,10 +20,10 @@
template <typename T, int VecSize = 1>
__global__ void decode_absorb_cache_kernel(
    const T* __restrict__ kv_nope,  // [bsz, kv_num_heads, pe_size] 512
    const T* __restrict__ kv_pe,    // [bsz, kv_num_heads, nope_size] 64
    T* __restrict__ kv_cache,       // [num_blocks, kv_num_heads, block_size,
                                    // nope_size]
    const int* __restrict__ block_tables,  // [bsz, max_blocks_per_seq]
    const int* __restrict__ cu_seqlens_q,
    const int* __restrict__ seq_lens,          // [bsz]
    const int* __restrict__ seq_lens_encoder,  // [bsz]
@@ -62,26 +62,25 @@ __global__ void decode_absorb_cache_kernel(
    const int block_idx = block_table_now[write_seq_id / block_size];
    const int block_offset = write_seq_id % block_size;

    if (bias < nope_hidden_size) {  // pe
      const uint32_t inner_bias = bias;
      const uint32_t hi = inner_bias / nope_size;
      const uint32_t h_bias = inner_bias % nope_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + h_bias;
      const uint32_t ori_idx = start_token_idx * nope_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_nope[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    } else {
      const uint32_t inner_bias = bias - nope_hidden_size;
      const uint32_t hi = inner_bias / pe_size;
      const uint32_t h_bias = inner_bias % pe_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + nope_size +
          h_bias;
      const uint32_t ori_idx = start_token_idx * pe_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_pe[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    }

@@ -91,10 +90,10 @@ __global__ void decode_absorb_cache_kernel(
template <typename T, int VecSize = 1>
__global__ void speculate_decode_absorb_cache_kernel(
    const T* __restrict__ kv_nope,  // [bsz, kv_num_heads, pe_size] 512
    const T* __restrict__ kv_pe,    // [bsz, kv_num_heads, nope_size] 64
    T* __restrict__ kv_cache,       // [num_blocks, kv_num_heads, block_size,
                                    // nope_size]
    const int* __restrict__ block_tables,  // [bsz, max_blocks_per_seq]
    const int* __restrict__ batch_id_per_token,
    const int* __restrict__ cu_seqlens_q,
    const int* __restrict__ seq_lens,  // [bsz]
@@ -125,8 +124,7 @@ __global__ void speculate_decode_absorb_cache_kernel(
    if (seq_lens[ori_bi] == 0) continue;
    const int bias = linear_index % hidden_size;
    const int start_token_idx = cu_seqlens_q[ori_bi];
    const int write_seq_id = seq_lens[ori_bi] + token_id - start_token_idx;
    if (write_seq_id == 0) continue;

    const int* block_table_now = nullptr;
@@ -145,26 +143,25 @@ __global__ void speculate_decode_absorb_cache_kernel(
          token_id,
          cu_seqlens_q[ori_bi]);
    }
    if (bias < nope_hidden_size) {  // pe
      const uint32_t inner_bias = bias;
      const uint32_t hi = inner_bias / nope_size;
      const uint32_t h_bias = inner_bias % nope_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + h_bias;
      const uint32_t ori_idx = token_id * nope_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_nope[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    } else {
      const uint32_t inner_bias = bias - nope_hidden_size;
      const uint32_t hi = inner_bias / pe_size;
      const uint32_t h_bias = inner_bias % pe_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + nope_size +
          h_bias;
      const uint32_t ori_idx = token_id * pe_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_pe[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    }

@@ -174,10 +171,10 @@ __global__ void speculate_decode_absorb_cache_kernel(
template <typename T, int VecSize = 1>
__global__ void prefill_absorb_cache_kernel(
    const T* __restrict__ kv_nope,  // [bsz, kv_num_heads, pe_size] 512
    const T* __restrict__ kv_pe,    // [bsz, kv_num_heads, nope_size] 64
    T* __restrict__ kv_cache,       // [num_blocks, kv_num_heads, block_size,
                                    // nope_size]
    const int* __restrict__ block_tables,  // [bsz, max_blocks_per_seq]
    const int* __restrict__ batch_id_per_token,
    const int* __restrict__ cu_seqlens_q,
    const int* __restrict__ seq_lens,  // [bsz]
@@ -206,33 +203,33 @@ __global__ void prefill_absorb_cache_kernel(
    const uint32_t bias = linear_index % hidden_size;
    const uint32_t ori_bi = batch_id_per_token[token_idx];
    if (seq_lens[ori_bi] == 0) continue;
    const uint32_t ori_seq_id =
        (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];

    const int* block_table_now = nullptr;
    block_table_now = block_tables + ori_bi * max_blocks_per_seq;
    const uint32_t block_idx = block_table_now[ori_seq_id / block_size];
    const uint32_t block_offset = ori_seq_id % block_size;

    if (bias < nope_hidden_size) {  // pe
      const uint32_t inner_bias = bias;
      const uint32_t hi = inner_bias / nope_size;
      const uint32_t h_bias = inner_bias % nope_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + h_bias;
      const uint32_t ori_idx = token_idx * nope_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_nope[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    } else {
      const uint32_t inner_bias = bias - nope_hidden_size;
      const uint32_t hi = inner_bias / pe_size;
      const uint32_t h_bias = inner_bias % pe_size;
      const uint32_t tgt_idx =
          block_idx * kv_num_heads * block_size * all_size +
          hi * block_size * all_size + block_offset * all_size + nope_size +
          h_bias;
      const uint32_t ori_idx = token_idx * pe_hidden_size + inner_bias;
      Load<T, VecSize>(&kv_pe[ori_idx], &src_vec);
      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
    }

@@ -16,37 +16,40 @@
#include "decode_attention_func.cuh"
#include "multiquery_decoder_attention_kernel.h"

#define CHECK(call)                                                  \
  do {                                                               \
    const cudaError_t error_code = call;                             \
    if (error_code != cudaSuccess) {                                 \
      printf("CUDA Error:\n");                                       \
      printf("  File: %s\n", __FILE__);                              \
      printf("  Line %d:\n", __LINE__);                              \
      printf("  Error code:%d\n", error_code);                       \
      printf("  Error text:%s\n", cudaGetErrorString(error_code));   \
      exit(1);                                                       \
    }                                                                \
  } while (0)

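// merge_varlen_multi_chunks_v2_kernel (below) folds the per-chunk partial
// results written by the split-KV decode pass into the final output for each
// (query, head). A minimal CPU reference of that merge rule follows as a
// sketch only; the helper name and the vector layout are illustrative and not
// part of the original diff.
#include <algorithm>
#include <cmath>
#include <vector>

inline void merge_chunks_reference(
    const std::vector<std::vector<float>>& o,  // [num_chunks][head_dim], unnormalized partial outputs
    const std::vector<float>& m,               // [num_chunks] running max logits
    const std::vector<float>& d,               // [num_chunks] softmax denominators
    std::vector<float>* out) {                 // [head_dim] final attention output
  const size_t num_chunks = m.size();
  const size_t head_dim = o.empty() ? 0 : o[0].size();
  float m_all = -INFINITY;
  float d_all = 0.f;
  std::vector<float> acc(head_dim, 0.f);
  for (size_t c = 0; c < num_chunks; ++c) {
    const float m_new = std::max(m_all, m[c]);
    const float s_prev = std::exp(m_all - m_new);  // rescale what we already accumulated
    const float s_now = std::exp(m[c] - m_new);    // rescale the incoming chunk
    d_all = d_all * s_prev + d[c] * s_now;
    for (size_t i = 0; i < head_dim; ++i) {
      acc[i] = acc[i] * s_prev + o[c][i] * s_now;
    }
    m_all = m_new;
  }
  for (size_t i = 0; i < head_dim; ++i) {
    (*out)[i] = acc[i] / d_all;  // single normalization at the very end
  }
}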
template <typename T,
          typename OutT,
          int vec_size,
          uint32_t bdy,
          uint32_t HEAD_DIM>
__global__ void merge_varlen_multi_chunks_v2_kernel(
    const T *__restrict__ multi_out,  // [bsz, num_chunks, num_heads, head_dim]
    const T *__restrict__ multi_m,    // [bsz, num_chunks, num_heads]
    const T *__restrict__ multi_d,    // [bsz, num_chunks, num_heads]
    const int *__restrict__ seq_lens_q,
    const int *__restrict__ seq_lens_kv,
    const int *__restrict__ cu_seqlens_q,
    const T *__restrict__ shift_bias,     // [q_num_heads * HEAD_DIM]
    const T *__restrict__ smooth_weight,  // [q_num_heads * HEAD_DIM]
    OutT *__restrict__ out,               // [token_num, num_heads, head_dim]
    const float in_scale,
    const int num_chunks,
    const int chunk_size,
    const int max_seq_len,
    const int num_heads,
    const int head_dim) {
  const int vid = threadIdx.x, ty = threadIdx.y;
  const int qid = blockIdx.x, hid = blockIdx.y;
  const int seq_len_q = seq_lens_q[qid];
@@ -68,12 +71,12 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(
  if constexpr (std::is_same<T, half>::value) {
#pragma unroll
    for (int i = 0; i < vec_size / 2; ++i) {
      *((half2 *)(&res_vec) + i) = make_half2(0, 0);
    }
  } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
#pragma unroll
    for (int i = 0; i < vec_size / 2; ++i) {
      *((nv_bfloat162 *)(&res_vec) + i) = make_bfloat162(0, 0);
    }
  }
  T m;
@@ -92,7 +95,8 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(
    const T m_now = multi_m[offset];
    const T d_now = multi_d[offset];
    m = m_prev > m_now ? m_prev : m_now;
    offset = (qid * num_chunks * num_heads + i * num_heads + hid) * head_dim +
             vid * vec_size;
    Load<T, vec_size>(&multi_out[offset], &load_vec);
    const T scale1 = hexp(m_prev - m), scale2 = hexp(m_now - m);
    d = d * scale1 + d_now * scale2;
@@ -124,30 +128,47 @@ __global__ void merge_varlen_multi_chunks_v2_kernel(
  for (int i = 0; i < vec_size; ++i) {
    out_vec[i] = static_cast<OutT>(st.o[i]);
  }
  Store<OutT, vec_size>(
      out_vec,
      &out[(start_token_ids * num_heads + hid) * head_dim + vid * vec_size]);
}

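// multi_query_decode_attention_kernel (below) is the split-KV decode kernel:
// each block handles one (batch, kv_head, chunk) triple, stages the query once
// in shared memory, double-buffers K/V tiles with produce_kv / commit_group /
// wait_group, and runs compute_qk plus compute_sv per tile. With a single
// chunk the result is normalized and written to out directly; otherwise the
// partial (o, m, d) go to tmp_workspace / tmp_m / tmp_d for the merge kernel
// above.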
template <bool partition_kv,
          typename T,
          typename OutT,
          typename CacheT,
          uint32_t NUM_STAGES,
          uint32_t DEAL_EACH_TIME,
          uint32_t GROUP_SIZE,
          uint32_t HEAD_DIM_QK,
          uint32_t HEAD_DIM_V,
          uint32_t BLOCK_SIZE,
          uint32_t VEC_SIZE,
          uint32_t CACHE_VEC_SIZE,
          uint32_t bdx,
          uint32_t bdy>
__global__ void multi_query_decode_attention_kernel(
    T *__restrict__ q,             // [token_num, num_heads, head_dim]
    CacheT *__restrict__ cache_k,  // [max_block_num, num_heads, block_size,
                                   // head_dim]
    CacheT *__restrict__ cache_v,
    const T *__restrict__ shift_bias,     // [q_num_heads * HEAD_DIM]
    const T *__restrict__ smooth_weight,  // [q_num_heads * HEAD_DIM]
    const int *__restrict__ seq_lens_q,
    const int *__restrict__ seq_lens_kv,
    const int *__restrict__ cu_seqlens_q,
    const int *__restrict__ block_table,  // [bsz, block_num_per_seq]
    const int max_seq_len,
    const int max_dec_len,
    const int max_block_num_per_seq,
    const float scale,
    const float in_scale,
    const uint32_t chunk_size,
    T *__restrict__ tmp_workspace,  // [batch_size, num_chunks, num_heads,
                                    // head_dim]
    T *__restrict__ tmp_m,  // [batch_size, num_chunks, num_heads]
    T *__restrict__ tmp_d,  // [batch_size, num_chunks, num_heads]
    OutT *__restrict__ out) {
  const uint32_t bidx = blockIdx.x, kv_head_idx = blockIdx.z;
  const uint32_t bid = bidx, gid = threadIdx.y;
  const uint32_t tidx = threadIdx.x;

@@ -167,9 +188,9 @@ __global__ void multi_query_decode_attention_kernel(
  if (q_len <= 0) {
    return;
  }
  uint32_t kv_len = seq_lens_kv[bid];  // !!!!!!!!
  if (kv_len <= 0) {
    return;
  }
  kv_len += q_len;
  const uint32_t num_chunk_this_seq = div_up(kv_len, chunk_size);
@@ -180,23 +201,24 @@ __global__ void multi_query_decode_attention_kernel(
  }

  const uint32_t chunk_start = partition_kv ? chunk_id * chunk_size : 0;
  const uint32_t chunk_end =
      partition_kv ? min(kv_len, chunk_start + chunk_size) : kv_len;
  const uint32_t chunk_len = chunk_end - chunk_start;

  extern __shared__ uint8_t smem[];
  const T *q_now = q + (q_start_idx * q_num_heads + q_head_idx) * HEAD_DIM_QK;
  T *q_smem = reinterpret_cast<T *>(smem);  // [HEAD_DIM_QK * sizeof(T)]
  T *cu_q_smem = q_smem + gid * HEAD_DIM_QK;
#pragma unroll
  for (uint32_t vid = tidx; vid < num_vec_per_head_qk; vid += bdx) {
    ((float4 *)(&cu_q_smem[vid * VEC_SIZE]))[0] =
        ((float4 *)(&q_now[vid * VEC_SIZE]))[0];
  }
  __syncthreads();
  using VecT = AlignedVector<T, VEC_SIZE>;
  VecT q_vec;
#pragma unroll
  for (uint32_t vid = tidx; vid < num_vec_per_head_qk; vid += bdx) {
    Load<T, VEC_SIZE>(cu_q_smem + vid * VEC_SIZE, &q_vec);
    for (uint32_t i = 0; i < VEC_SIZE; ++i) {
      q_vec[i] *= scale;
@@ -204,8 +226,8 @@ __global__ void multi_query_decode_attention_kernel(
    Store<T, VEC_SIZE>(q_vec, cu_q_smem + vid * VEC_SIZE);
  }

  CacheT *kv_smem = reinterpret_cast<CacheT *>(smem + GROUP_SIZE * HEAD_DIM_QK *
                                                          sizeof(CacheT));
  uint32_t stage_idx = 0;
  constexpr int loop_times = DEAL_EACH_TIME / bdy;
#pragma unroll
@@ -214,24 +236,27 @@ __global__ void multi_query_decode_attention_kernel(
    for (int j = 0; j < loop_times; ++j) {
      const uint32_t k_seq_offset = i * DEAL_EACH_TIME + j * bdy + gid;
      const uint32_t k_seq_id = chunk_start + k_seq_offset;
      produce_kv<SharedMemFillMode::kNoFill,
                 HEAD_DIM_QK,
                 VEC_SIZE,
                 num_vec_per_head_qk,
                 bdx,
                 BLOCK_SIZE,
                 CACHE_VEC_SIZE>(kv_smem,
                                 cache_k,
                                 block_table_now,
                                 k_seq_id,
                                 k_seq_offset,
                                 kv_head_idx,
                                 kv_num_heads,
                                 tidx,
                                 chunk_start,
                                 chunk_end);
    }
    commit_group();
    stage_idx = (stage_idx + 1) % NUM_STAGES;
  }

  softmax_state_ts<VEC_SIZE, T, num_tile_v> st;
  float s[DEAL_EACH_TIME];

@@ -240,48 +265,55 @@ __global__ void multi_query_decode_attention_kernel(
    wait_group<NUM_STAGES - 1>();
    __syncthreads();
    // compute qk
    compute_qk<VEC_SIZE,
               num_vec_per_head_qk,
               bdx,
               bdy,
               HEAD_DIM_QK,
               DEAL_EACH_TIME,
               num_tile_v>(cu_q_smem,
                           kv_smem,
                           chunk_start + iter * DEAL_EACH_TIME,
                           stage_idx,
                           iter * DEAL_EACH_TIME,
                           chunk_len,
                           tidx,
                           gid,
                           scale,
                           s,
                           st);
    __syncthreads();

    // compute sv
    compute_sv<VEC_SIZE,
               num_vec_per_head_v,
               bdx,
               DEAL_EACH_TIME,
               HEAD_DIM_QK,
               num_tile_v>(
        s, kv_smem, stage_idx, iter * DEAL_EACH_TIME, chunk_len, tidx, st);
    __syncthreads();

#pragma unroll
    for (int j = 0; j < loop_times; ++j) {
      const uint32_t k_seq_offset = j * bdy + gid;
      produce_kv<SharedMemFillMode::kNoFill,
                 HEAD_DIM_QK,
                 VEC_SIZE,
                 num_vec_per_head_qk,
                 bdx,
                 BLOCK_SIZE,
                 CACHE_VEC_SIZE>(
          kv_smem,
          cache_k,
          block_table_now,
          chunk_start + k_seq_offset + (iter + NUM_STAGES) * DEAL_EACH_TIME,
          stage_idx * DEAL_EACH_TIME + k_seq_offset,
          kv_head_idx,
          kv_num_heads,
          tidx,
          chunk_start,
          chunk_end);
    }
    commit_group();
    stage_idx = (stage_idx + 1) % NUM_STAGES;
@@ -290,45 +322,59 @@ __global__ void multi_query_decode_attention_kernel(
  __syncthreads();

  // normalize if not partition_kv
  for (uint32_t vid = tidx; vid < num_vec_per_head_v; vid += bdx) {
    const uint32_t tile_id = vid / bdx;
    if (!partition_kv || num_chunk_this_seq == 1) {
      st.normalize(tile_id);
    }
    if (partition_kv && num_chunk_this_seq > 1) {
      const uint32_t head_idx =
          (bid * num_chunks + chunk_id) * q_num_heads + q_head_idx;
      Store<T, VEC_SIZE>(
          st.o[tile_id],
          tmp_workspace + head_idx * HEAD_DIM_V + vid * VEC_SIZE);
      tmp_m[head_idx] = st.m;
      tmp_d[head_idx] = st.d;
    } else {
      Store<OutT, VEC_SIZE>(
          st.o[tile_id],
          out + (q_write_idx * q_num_heads + q_head_idx) * HEAD_DIM_V +
              vid * VEC_SIZE);
    }
  }
}

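// MultiQueryDecoderAttention (below) is the host-side launcher: it picks the
// split-KV or single-chunk instantiation, sizes dynamic shared memory for the
// query tile plus num_stages K/V stages, uses
// cudaOccupancyMaxActiveBlocksPerMultiprocessor to relate the blocks needed to
// one GPU wave, and, when more than one chunk is used, allocates the temporary
// workspace and finishes with merge_varlen_multi_chunks_v2_kernel.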
template <typename T,
          uint32_t GROUP_SIZE,
          uint32_t HEAD_DIM_QK,
          uint32_t HEAD_DIM_V,
          uint32_t BLOCK_SIZE,
          bool CAUSAL,
          uint32_t NUM_STAGE,
          uint32_t cache_bytes,
          uint32_t DEAL_EACH_TIME>
void MultiQueryDecoderAttention(
    const AppendAttnMetaData &meta_data,
    cudaStream_t &stream,
    const paddle::Tensor &q,
    const paddle::Tensor
        &cache_k,  // [max_block_num, num_kv_heads, block_size, head_dim]
    const paddle::Tensor &cache_v,  // [num_kv_heads, head_dim]
    const paddle::optional<paddle::Tensor> &attn_mask,
    const paddle::optional<paddle::Tensor> &shift_bias,
    const paddle::optional<paddle::Tensor> &smooth_weight,
    const paddle::Tensor &seq_lens_q,
    const paddle::Tensor &seq_lens_kv,
    const paddle::Tensor &batch_id_per_token,
    const paddle::Tensor &cu_seqlens_q,
    const paddle::Tensor &block_table,
    const int max_seq_len,
    const int max_dec_len,
    const float rope_scale,
    const float rope_theta,
    const float softmax_scale,
    const float in_scale,
    paddle::Tensor *out) {
  using NV_TYPE = typename cascade_attn_type_traits<T>::type;

  auto num_heads = meta_data.q_num_heads;

@@ -338,8 +384,8 @@ void MultiQueryDecoderAttention(
  auto max_block_num_per_seq = meta_data.max_blocks_per_seq;
  constexpr int num_stages = NUM_STAGE;

  constexpr int vec_size = 16 / sizeof(T);           // 8 16 32
  constexpr int cache_vec_size = 128 / cache_bytes;  // 8 16 32
  constexpr int blockxc = HEAD_DIM_QK / cache_vec_size;
  constexpr int num_vec_per_head = HEAD_DIM_QK / vec_size;
  constexpr int blockx = num_vec_per_head < 32 ? num_vec_per_head : 32;
@@ -349,12 +395,25 @@ void MultiQueryDecoderAttention(

  constexpr int num_threads = blockx * blocky;

  auto splitkv_kernel = multi_query_decode_attention_kernel<true,
                                                            NV_TYPE,
                                                            NV_TYPE,
                                                            NV_TYPE,
                                                            num_stages,
                                                            DEAL_EACH_TIME,
                                                            GROUP_SIZE,
                                                            HEAD_DIM_QK,
                                                            HEAD_DIM_V,
                                                            BLOCK_SIZE,
                                                            vec_size,
                                                            cache_vec_size,
                                                            blockx,
                                                            blocky>;
  uint32_t cache_smem_bytes = 0;

  const T *shift_bias_ptr = shift_bias ? shift_bias.get().data<T>() : nullptr;
  const T *smooth_weight_ptr =
      smooth_weight ? smooth_weight.get().data<T>() : nullptr;
  cache_smem_bytes = num_stages * DEAL_EACH_TIME * HEAD_DIM_QK * sizeof(T);

  const uint32_t chunk_size = get_max_partition_size(bsz);

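// Worked example of the constants above (illustrative values, not from the
// diff): for T = half, vec_size = 16 / sizeof(half) = 8 elements per 16-byte
// load; for an 8-bit cache dtype, cache_vec_size = 128 / 8 = 16 elements per
// 128-bit load; blockx is then capped at 32 so one warp strides across the
// head dimension.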
@@ -363,51 +422,64 @@

  if (smem_size >= 48 * 1024) {
    cudaFuncSetAttribute(
        splitkv_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
  }
  const int dev_id = 0;
  int sm_count;
  int act_blocks_per_sm;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &act_blocks_per_sm, splitkv_kernel, num_threads, smem_size);
  assert(act_blocks_per_sm > 1);

  const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
  const int num_blocks_need = gridx * num_chunks * kv_num_heads;
  const int max_num_chunks = div_up(num_blocks_per_wave, num_blocks_need);
  const float ratio = static_cast<float>(num_blocks_need) /
                      static_cast<float>(num_blocks_per_wave);

  dim3 grids(gridx, num_chunks, kv_num_heads);
  dim3 blocks(blockx, blocky);

  if (num_chunks <= 1) {
    auto no_splitkv_kernel = multi_query_decode_attention_kernel<false,
                                                                 NV_TYPE,
                                                                 NV_TYPE,
                                                                 NV_TYPE,
                                                                 num_stages,
                                                                 DEAL_EACH_TIME,
                                                                 GROUP_SIZE,
                                                                 HEAD_DIM_QK,
                                                                 HEAD_DIM_V,
                                                                 BLOCK_SIZE,
                                                                 vec_size,
                                                                 cache_vec_size,
                                                                 blockx,
                                                                 blocky>;
    if (smem_size >= 48 * 1024) {
      cudaFuncSetAttribute(no_splitkv_kernel,
                           cudaFuncAttributeMaxDynamicSharedMemorySize,
                           smem_size);
    }
    no_splitkv_kernel<<<grids, blocks, smem_size, stream>>>(
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(q.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(shift_bias_ptr)),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(smooth_weight_ptr)),
        seq_lens_q.data<int>(),
        seq_lens_kv.data<int>(),
        cu_seqlens_q.data<int>(),
        block_table.data<int>(),
        max_seq_len,
        max_dec_len,
        max_block_num_per_seq,
        softmax_scale,
        in_scale,
        chunk_size,
        nullptr,
        nullptr,
        nullptr,
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(out->data<T>())));

    // CHECK(cudaGetLastError());
    // CHECK(cudaDeviceSynchronize());

@@ -417,34 +489,33 @@ void MultiQueryDecoderAttention(
    tmp_workspace = allocator->Allocate(
        phi::SizeOf(q.dtype()) *
        static_cast<size_t>(bsz * num_chunks * num_heads * HEAD_DIM_V));
    tmp_m =
        allocator->Allocate(phi::SizeOf(q.dtype()) *
                            static_cast<size_t>(bsz * num_chunks * num_heads));
    tmp_d =
        allocator->Allocate(phi::SizeOf(q.dtype()) *
                            static_cast<size_t>(bsz * num_chunks * num_heads));

    splitkv_kernel<<<grids, blocks, smem_size, stream>>>(
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(q.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_k.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(cache_v.data<T>())),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(shift_bias_ptr)),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(smooth_weight_ptr)),
        seq_lens_q.data<int>(),
        seq_lens_kv.data<int>(),
        cu_seqlens_q.data<int>(),
        block_table.data<int>(),
        max_seq_len,
        max_dec_len,
        max_block_num_per_seq,
        softmax_scale,
        in_scale,
        chunk_size,
        reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
        reinterpret_cast<NV_TYPE *>(tmp_m->ptr()),
        reinterpret_cast<NV_TYPE *>(tmp_d->ptr()),
        reinterpret_cast<NV_TYPE *>(const_cast<T *>(out->data<T>())));
    // CHECK(cudaGetLastError());
    // CHECK(cudaDeviceSynchronize());

@@ -452,23 +523,27 @@ void MultiQueryDecoderAttention(
    constexpr int bdy = 256 / mblockx;
    dim3 grids_merge(bsz, num_heads);
    dim3 blocks_merge(mblockx, bdy);
    merge_varlen_multi_chunks_v2_kernel<NV_TYPE,
                                        NV_TYPE,
                                        vec_size,
                                        bdy,
                                        HEAD_DIM_V>
        <<<grids_merge, blocks_merge, 0, stream>>>(
            reinterpret_cast<NV_TYPE *>(tmp_workspace->ptr()),
            reinterpret_cast<NV_TYPE *>(tmp_m->ptr()),
            reinterpret_cast<NV_TYPE *>(tmp_d->ptr()),
            seq_lens_q.data<int>(),
            seq_lens_kv.data<int>(),
            cu_seqlens_q.data<int>(),
            reinterpret_cast<NV_TYPE *>(const_cast<T *>(shift_bias_ptr)),
            reinterpret_cast<NV_TYPE *>(const_cast<T *>(smooth_weight_ptr)),
            reinterpret_cast<NV_TYPE *>(const_cast<T *>(out->data<T>())),
            in_scale,
            num_chunks,
            chunk_size,
            max_seq_len,
            num_heads,
            HEAD_DIM_V);
  }
  // CHECK(cudaGetLastError());
  // CHECK(cudaDeviceSynchronize());

@@ -15,25 +15,34 @@
#include "decode_attention_func.cuh"

template <typename T, uint32_t GROUP_SIZE, uint32_t HEAD_DIM_QK, uint32_t HEAD_DIM_V, uint32_t BLOCK_SIZE, bool CAUSAL, uint32_t NUM_STAGE, uint32_t cache_bytes, uint32_t DEAL_EACH_TIME>
template <typename T,
uint32_t GROUP_SIZE,
uint32_t HEAD_DIM_QK,
uint32_t HEAD_DIM_V,
uint32_t BLOCK_SIZE,
bool CAUSAL,
uint32_t NUM_STAGE,
uint32_t cache_bytes,
uint32_t DEAL_EACH_TIME>
void MultiQueryDecoderAttention(
const AppendAttnMetaData& meta_data,
cudaStream_t &stream,
const paddle::Tensor &q,
const paddle::Tensor &cache_k, // [max_block_num, num_kv_heads, block_size, head_dim]
const paddle::Tensor &cache_v, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>& shift_bias,
const paddle::optional<paddle::Tensor>& smooth_weight,
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const int max_seq_len,
const int max_dec_len,
const float rope_scale,
const float rope_theta,
const float softmax_scale,
const float in_scale,
paddle::Tensor *out);
const AppendAttnMetaData &meta_data,
cudaStream_t &stream,
const paddle::Tensor &q,
const paddle::Tensor
&cache_k, // [max_block_num, num_kv_heads, block_size, head_dim]
const paddle::Tensor &cache_v, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor> &attn_mask,
const paddle::optional<paddle::Tensor> &shift_bias,
const paddle::optional<paddle::Tensor> &smooth_weight,
const paddle::Tensor &seq_lens_q,
const paddle::Tensor &seq_lens_kv,
const paddle::Tensor &batch_id_per_token,
const paddle::Tensor &cu_seqlens_q,
const paddle::Tensor &block_table,
const int max_seq_len,
const int max_dec_len,
const float rope_scale,
const float rope_theta,
const float softmax_scale,
const float in_scale,
paddle::Tensor *out);
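Per the comment above, cache_k is laid out as [max_block_num, num_kv_heads, block_size, head_dim] and block_table maps a token's logical position to a physical block. A small sketch of how one key vector would be located under that layout; the function and parameter names are illustrative, not the repository's:

#include <cstddef>
#include <cstdint>

// Sketch only: offset (in elements) of the key vector for one token and KV head,
// assuming the paged layout described in the comment above.
inline size_t k_vector_offset(const int* block_table,  // per-sequence block ids
                              uint32_t token_pos,      // position in the sequence
                              uint32_t kv_head_idx,
                              uint32_t num_kv_heads,
                              uint32_t block_size,
                              uint32_t head_dim) {
  const int block_id = block_table[token_pos / block_size];  // physical block
  const uint32_t in_block = token_pos % block_size;          // slot inside the block
  return ((static_cast<size_t>(block_id) * num_kv_heads + kv_head_idx) * block_size +
          in_block) * head_dim;
}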
@@ -27,7 +27,7 @@ struct AppendAttnMetaData {
int head_dims;
int head_dims_v;
int max_blocks_per_seq;
const int *mask_offset = nullptr;
const int* mask_offset = nullptr;
};

__forceinline__ __host__ __device__ int div_up(int a, int b) {
@@ -110,29 +110,33 @@ __device__ __forceinline__ uint32_t sub_if_greater_or_zero(uint32_t x,
/******************************FASTER CAST*********************************/

inline __device__ static void convert_fp8(__nv_bfloat16* result, const uint32_t& source) {
inline __device__ static void convert_fp8(__nv_bfloat16* result,
const uint32_t& source) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
uint32_t dest0;
uint32_t dest1;
asm volatile( \
"{\n" \
".reg .b16 lo, hi;\n" \
"mov.b32 {lo, hi}, %2;\n" \
"cvt.rn.f16x2.e4m3x2 %0, lo;\n" \
"cvt.rn.f16x2.e4m3x2 %1, hi;\n" \
"}\n" : "=r"(dest0), "=r"(dest1) : "r"(source));
uint32_t dest0;
uint32_t dest1;
asm volatile(
"{\n"
".reg .b16 lo, hi;\n"
"mov.b32 {lo, hi}, %2;\n"
"cvt.rn.f16x2.e4m3x2 %0, lo;\n"
"cvt.rn.f16x2.e4m3x2 %1, hi;\n"
"}\n"
: "=r"(dest0), "=r"(dest1)
: "r"(source));

((nv_bfloat162*)(result))[0] = __float22bfloat162_rn(__half22float2(((half2*)(&dest0))[0]));
((nv_bfloat162*)(result))[1] = __float22bfloat162_rn(__half22float2(((half2*)(&dest1))[0]));
((nv_bfloat162*)(result))[0] =
__float22bfloat162_rn(__half22float2(((half2*)(&dest0))[0]));
((nv_bfloat162*)(result))[1] =
__float22bfloat162_rn(__half22float2(((half2*)(&dest1))[0]));
#else
printf("Do not support fp8 in arch < 890\n");
asm("trap;");
printf("Do not support fp8 in arch < 890\n");
asm("trap;");
#endif

}

inline __device__ static void convert_fp8(half* result, const uint32_t& source) {
inline __device__ static void convert_fp8(half* result,
const uint32_t& source) {
printf("Do not support fp8 to half although it's very easy.\n");
}
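convert_fp8 unpacks four packed e4m3 bytes with the cvt.rn.f16x2.e4m3x2 PTX instruction and widens the resulting half2 pairs to bfloat16. For reference, a scalar host-side decode of a single e4m3 byte; this is a sketch for illustration only, not part of the kernel path:

#include <cmath>
#include <cstdint>

// Reference decode of one FP8 E4M3 byte (1 sign bit, 4 exponent bits with bias 7,
// 3 mantissa bits; exponent 1111 with mantissa 111 encodes NaN, and there is no
// infinity encoding).
inline float fp8_e4m3_to_float(uint8_t v) {
  const int sign = (v >> 7) & 0x1;
  const int exp = (v >> 3) & 0xF;
  const int mant = v & 0x7;
  float out;
  if (exp == 0xF && mant == 0x7) {
    out = NAN;
  } else if (exp == 0) {  // subnormal: value = (mant / 8) * 2^-6
    out = std::ldexp(static_cast<float>(mant) / 8.0f, -6);
  } else {                // normal: (1 + mant / 8) * 2^(exp - 7)
    out = std::ldexp(1.0f + static_cast<float>(mant) / 8.0f, exp - 7);
  }
  return sign ? -out : out;
}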
@@ -301,8 +305,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
#define DISPATCH_HEAD_DIM(head_dim, HEAD_DIM, ...) \
switch (head_dim) { \
case 64: { \
constexpr size_t HEAD_DIM = 64; \
case 64: { \
constexpr size_t HEAD_DIM = 64; \
__VA_ARGS__ \
break; \
} \
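This family of dispatch macros turns a runtime value into a constexpr so that a template can be instantiated per supported case. A minimal standalone example of the same pattern, with hypothetical names rather than the repository's actual call sites:

#include <cstddef>
#include <cstdio>

// Hypothetical launcher templated on a compile-time head dimension.
template <std::size_t HEAD_DIM>
void launch_attention() {
  std::printf("instantiated for HEAD_DIM=%zu\n", HEAD_DIM);
}

// Same dispatch idiom as DISPATCH_HEAD_DIM: map a runtime head_dim to a constexpr.
#define DISPATCH_HEAD_DIM_DEMO(head_dim, HEAD_DIM, ...) \
  switch (head_dim) {                                   \
    case 64: {                                          \
      constexpr std::size_t HEAD_DIM = 64;              \
      __VA_ARGS__                                       \
      break;                                            \
    }                                                   \
    case 128: {                                         \
      constexpr std::size_t HEAD_DIM = 128;             \
      __VA_ARGS__                                       \
      break;                                            \
    }                                                   \
  }

int main() {
  int head_dim = 128;  // would come from tensor metadata at runtime
  DISPATCH_HEAD_DIM_DEMO(head_dim, HEAD_DIM, { launch_attention<HEAD_DIM>(); });
  return 0;
}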
@@ -385,9 +389,8 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
PD_THROW("not support the cache_type: ", cache_type); \
}

#define DISPATCH_DEAL_EACH_TIME(deal_each_time, DEAL_EACH_TIME, ...) \
if (deal_each_time == 32) { \
if (deal_each_time == 32) { \
constexpr size_t DEAL_EACH_TIME = 32; \
__VA_ARGS__ \
} else if (deal_each_time == 64) { \
@@ -404,7 +407,7 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
} else if (num_threads == 256) { \
constexpr size_t NUM_THREADS = 256; \
__VA_ARGS__ \
} else { \
} else { \
PD_THROW("not support the num_threads", num_threads); \
}
@@ -456,7 +459,7 @@ __forceinline__ __host__ __device__ void vec_cast<nv_bfloat16, float>(
}

#define DISPATCH_MLA_GROUP_SIZE(group_size, GROUP_SIZE, ...) \
if (group_size == 8) { \
if (group_size == 8) { \
constexpr size_t GROUP_SIZE = 8; \
__VA_ARGS__ \
} else if (group_size == 16) { \
@@ -538,9 +541,11 @@ inline HOSTDEVICE T roundWithTiesToEven(T x) {
: xUpper);
}

template <typename T, bool is_need_kv_quant, bool IsFP8, int RoundType = 0>
__host__ __device__ __forceinline__ uint8_t QuantToC8(const T scale, const T value, const float max_bound, const float min_bound) {
__host__ __device__ __forceinline__ uint8_t QuantToC8(const T scale,
const T value,
const float max_bound,
const float min_bound) {
uint8_t eight_bits;
float quant_value;
if constexpr (is_need_kv_quant) {
@@ -572,8 +577,8 @@ __host__ __device__ __forceinline__ uint8_t QuantToC8(const T scale, const T val
return eight_bits;
}
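QuantToC8 maps one floating-point cache value to a single byte, either int8 or fp8 depending on IsFP8, after applying a per-channel scale. A plain sketch of the int8 branch's arithmetic, with the rounding and clamping conventions assumed rather than copied from the kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch of scale-then-clamp int8 quantization; bounds of 127 / -127 are the usual
// symmetric-int8 choice. The real kernel also has an FP8 branch and a rounding
// mode selected by RoundType.
inline uint8_t quant_to_int8(float scale, float value,
                             float max_bound = 127.f,
                             float min_bound = -127.f) {
  float q = value * scale;                              // scale into quantized range
  q = std::round(q);                                    // round to nearest integer
  q = std::max(min_bound, std::min(max_bound, q));      // clamp to representable range
  return static_cast<uint8_t>(static_cast<int8_t>(q));  // store as a byte
}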

template <typename T, bool IsFP8>inline __device__ static void convert_c8(T * result, const uint32_t& source){
template <typename T, bool IsFP8>
inline __device__ static void convert_c8(T* result, const uint32_t& source) {
if constexpr (IsFP8) {
convert_fp8(result, source);
} else {
@@ -583,12 +588,12 @@ template <typename T, bool IsFP8>inline __device__ static void convert_c8(T * re
constexpr int kWarpSize = 32;

template<typename T>
template <typename T>
inline __device__ void WelfordCombine1(T b_m2, T* m2) {
*m2 += b_m2;
}

template<typename T, int thread_group_width = kWarpSize>
template <typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpReduce(T thread_m2, T* m2) {
*m2 = thread_m2;
for (int mask = thread_group_width / 2; mask > 0; mask >>= 1) {
@@ -597,7 +602,7 @@ __inline__ __device__ void WelfordWarpReduce(T thread_m2, T* m2) {
}
}

template<typename T, int thread_group_width = kWarpSize>
template <typename T, int thread_group_width = kWarpSize>
__inline__ __device__ void WelfordWarpAllReduce(T thread_m2, T* m2) {
WelfordWarpReduce<T, thread_group_width>(thread_m2, m2);
}
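WelfordWarpReduce accumulates each lane's m2 across the warp with a butterfly pattern; the hunk cuts off the shuffle inside the loop. A self-contained sketch of the same reduction idiom using __shfl_xor_sync, combining with a plain sum as WelfordCombine1 does; the function names here are illustrative:

// Butterfly warp reduction: after log2(32) = 5 exchanges every lane holds the sum.
// FULL_MASK assumes all 32 lanes of the warp participate.
__device__ float warp_all_reduce_sum(float val) {
  constexpr unsigned FULL_MASK = 0xffffffffu;
  for (int mask = 16; mask > 0; mask >>= 1) {
    val += __shfl_xor_sync(FULL_MASK, val, mask);  // exchange with the lane XOR mask away
  }
  return val;
}

__global__ void warp_reduce_demo(float* out) {
  float v = static_cast<float>(threadIdx.x);  // each lane contributes its lane id
  float s = warp_all_reduce_sum(v);           // 0 + 1 + ... + 31 = 496 on every lane
  if (threadIdx.x == 0) *out = s;
}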