[Optimization][DeepSeekV3.2] Reduce slot_mapping computation from twice per layer to a single pre-processing step. (#7367)

This commit is contained in:
ShaneGZhu
2026-04-16 19:54:12 +08:00
committed by GitHub
parent d2d633b05c
commit 2d8338f9e4
10 changed files with 73 additions and 146 deletions
+5
View File
@@ -188,6 +188,11 @@ class InputBatch:
self.cu_seqlens_q = paddle.full([max_num_seqs + 1], 0, dtype="int32")
self.cu_seqlens_k = paddle.full([max_num_seqs + 1], 0, dtype="int32")
# Pre-allocate 1-D addressing buffers sized to max_num_batched_tokens
_max_batched_tokens = self.scheduler_config.max_num_batched_tokens
self.position_ids_buffer = paddle.zeros([_max_batched_tokens], dtype=paddle.int32)
self.slot_mapping_buffer = paddle.zeros([_max_batched_tokens], dtype=paddle.int64)
# Declare AttentionBackend buffers
self.decoder_batch_ids = None
self.decoder_tile_ids_per_batch = None