[Optimization][DeepSeekV3.2] Reduce slot_mapping computation from twice per layer to a single pre-processing step (#7367)

2026-04-23 08:21:53 +08:00 · 2026-04-16 19:54:12 +08:00
parent d2d633b05c
commit 2d8338f9e4
10 changed files with 73 additions and 146 deletions
@@ -188,6 +188,11 @@ class InputBatch:
        self.cu_seqlens_q = paddle.full([max_num_seqs + 1], 0, dtype="int32")
        self.cu_seqlens_k = paddle.full([max_num_seqs + 1], 0, dtype="int32")

+        # Preallocate addressing buffers (position ids and slot mapping), each sized to max_num_batched_tokens
+        _max_batched_tokens = self.scheduler_config.max_num_batched_tokens
+        self.position_ids_buffer = paddle.zeros([_max_batched_tokens], dtype=paddle.int32)
+        self.slot_mapping_buffer = paddle.zeros([_max_batched_tokens], dtype=paddle.int64)
+
        # Declare AttentionBackend buffers
        self.decoder_batch_ids = None
        self.decoder_tile_ids_per_batch = None