mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature][OP] Add batch-invariant RMSNorm kernel and TP embedding Custom AR path (#6749)
* [Feature] Add batch-invariant RMSNorm kernel and TP embedding Custom AR path - Add Triton-based rms_norm_batch_invariant kernel for M-invariant RMSNorm - Add linear/linear_v2 tracking wrappers in batch_invariant_mode - Route TP VocabParallelEmbedding through Custom AR instead of NCCL - Increase FD_CUSTOM_AR_MAX_SIZE_MB default from 8 to 64 - Add unit tests for RMSNorm and TP embedding invariance * [Fix] Fix test tolerances for bfloat16 RMSNorm and custom AR buffer size - Relax bfloat16 atol from 1e-3 to 1e-2 for D=3584 in RMSNorm numerical correctness test (0.0078125 diff is expected at bfloat16 precision) - Update test_communication expected buffer size from 8MB to 64MB to match FD_CUSTOM_AR_MAX_SIZE_MB default change in envs.py Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Add RMSNorm layer batch_invariant_mode unit test for coverage Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Add pragma no cover for Triton kernel and multi-GPU embedding path Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: gongweibao <gognweibao@baidu.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
|
||||
#include "helper.h" // For getEnvDeterministicMode, getEnvDeterministicDebug
|
||||
#include "helper.h" // For getBoolEnv
|
||||
#include "multiquery_attention_c16_kernel.h"
|
||||
|
||||
template <typename T,
|
||||
@@ -33,7 +33,7 @@ __global__ void multi_query_append_attention_kernel(
|
||||
const T *__restrict__ q, // [token_num, (num_heads + 2* kv_num_head) *
|
||||
// head_dim]
|
||||
const T *__restrict__ cache_k, // [max_block_num, num_heads, block_size,
|
||||
// head_dim]
|
||||
// head_dim]
|
||||
const T *__restrict__ cache_v,
|
||||
const T *__restrict__ shift_bias, // [q_num_heads * HEAD_DIM]
|
||||
const T *__restrict__ smooth_weight, // [q_num_heads * HEAD_DIM]
|
||||
@@ -54,9 +54,9 @@ __global__ void multi_query_append_attention_kernel(
|
||||
const uint32_t chunk_size,
|
||||
const int num_blocks_x_cpu,
|
||||
T *__restrict__ tmp_workspace, // split kv [token_num, num_chunks,
|
||||
// num_heads, head_dim]
|
||||
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
|
||||
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
|
||||
// num_heads, head_dim]
|
||||
float *__restrict__ tmp_m, // [token_num, num_chunks, num_heads]
|
||||
float *__restrict__ tmp_d, // [token_num, num_chunks, num_heads]
|
||||
OutT *__restrict__ out,
|
||||
const int speculate_max_draft_token_num = 5,
|
||||
const int sliding_window = 0,
|
||||
|
||||
Reference in New Issue
Block a user