Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00, at commit 2e63d88f7a.
Commit summary: optimize speculate pre-process unit test; add a CUDA kernel for building sampling params in speculative decoding; initialize the infer seed on device; add unit tests, formatting, and rebase fixes.
File: Python, 238 lines, 9.3 KiB.
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import unittest
|
|
|
|
import numpy as np
|
|
import paddle
|
|
|
|
from fastdeploy.model_executor.ops.gpu import build_sampling_params
|
|
|
|
# Modulus applied to every per-token seed (2**63 - 2); presumably mirrors the
# constant used inside the CUDA kernel — confirm against the kernel source.
MAX_INFER_SEED = 9223372036854775806
# Number of CUDA threads per block assumed by the kernel's striding scheme.
BLOCK_DIM = 64


def build_sampling_params_ref(
    top_p,
    top_k,
    infer_seed,
    cu_seq_lens_q_output,
    token_num_output_cpu,
    increment_value,
):
    """
    Python reference implementation for BuildSamplingParamsKernel.

    Emulates one thread block of BLOCK_DIM lanes per request: lane `t`
    fills output positions t, t + BLOCK_DIM, t + 2*BLOCK_DIM, ... and
    advances its seed by BLOCK_DIM * 4 (mod MAX_INFER_SEED) per step.

    Returns:
        top_p_padding: float32[token_num_output_cpu, 1]
        top_k_padding: int64[token_num_output_cpu, 1]
        topp_seed: int64[token_num_output_cpu, 1]
        infer_seed: int64[real_bsz] (updated copy of the input seeds)
    """
    batch = len(top_p)
    out_shape = (token_num_output_cpu, 1)
    top_p_padding = np.zeros(out_shape, dtype=np.float32)
    top_k_padding = np.zeros(out_shape, dtype=np.int64)
    topp_seed = np.zeros(out_shape, dtype=np.int64)
    infer_seed = infer_seed.copy()

    for bi in range(batch):
        lo = cu_seq_lens_q_output[bi]
        n_tokens = cu_seq_lens_q_output[bi + 1] - lo
        p, k = top_p[bi], top_k[bi]

        for lane in range(BLOCK_DIM):
            # NOTE: keep this as np.int64 arithmetic so any wraparound
            # matches the kernel's int64 behavior for seeds near the modulus.
            lane_seed = (infer_seed[bi] + lane * 4) % MAX_INFER_SEED
            for pos in range(lane, n_tokens, BLOCK_DIM):
                idx = lo + pos
                top_p_padding[idx, 0] = p
                top_k_padding[idx, 0] = k
                topp_seed[idx, 0] = lane_seed
                lane_seed = (lane_seed + BLOCK_DIM * 4) % MAX_INFER_SEED

        # Each request's base seed advances once per kernel launch.
        infer_seed[bi] = (infer_seed[bi] + increment_value) % MAX_INFER_SEED

    return top_p_padding, top_k_padding, topp_seed, infer_seed
|
|
|
|
|
|
def build_inputs(real_bsz, seq_lens_this_time_list, seq_lens_encoder_list, seed=42):
    """
    Helper to build test inputs.

    Output-length rule per slot:
      - seq_lens_this_time == 0 -> empty slot, 0 output tokens;
      - seq_lens_encoder > 0    -> prefill request, 1 output token;
      - otherwise               -> decode request, seq_lens_this_time tokens.
    """
    rng = np.random.default_rng(seed)

    # Draw order matters for reproducibility: top_p, then top_k, then seeds.
    top_p = rng.uniform(0.0, 1.0, size=(real_bsz,)).astype(np.float32)
    top_k = rng.integers(1, 100, size=(real_bsz,)).astype(np.int64)
    infer_seed = rng.integers(0, MAX_INFER_SEED, size=(real_bsz,)).astype(np.int64)

    seq_lens_this_time = np.array(seq_lens_this_time_list, dtype=np.int32)
    seq_lens_encoder = np.array(seq_lens_encoder_list, dtype=np.int32)

    # Vectorized form of the per-slot rule described in the docstring.
    seq_lens_output = np.where(
        seq_lens_this_time == 0,
        0,
        np.where(seq_lens_encoder > 0, 1, seq_lens_this_time),
    ).astype(np.int32)

    # Exclusive prefix sum over output lengths (cu_seq_lens_q_output[0] == 0).
    cu_seq_lens_q_output = np.concatenate(
        ([0], np.cumsum(seq_lens_output))
    ).astype(np.int32)

    token_num_output_cpu = int(cu_seq_lens_q_output[-1])

    return {
        "top_p": top_p,
        "top_k": top_k,
        "infer_seed": infer_seed,
        "seq_lens_this_time": seq_lens_this_time,
        "cu_seq_lens_q_output": cu_seq_lens_q_output,
        "token_num_output_cpu": token_num_output_cpu,
    }
|
|
|
|
|
|
def run_and_compare(tc, inputs, increment_value):
    """
    Run the GPU op and the Python reference on identical inputs and assert
    that every output matches, including the in-place infer_seed update.
    """
    # Keep a handle on the seed tensor: the op updates it in place.
    t_infer_seed = paddle.to_tensor(inputs["infer_seed"], dtype="int64")
    token_num_output_cpu = inputs["token_num_output_cpu"]

    gpu_outs = build_sampling_params(
        paddle.to_tensor(inputs["top_p"], dtype="float32"),
        paddle.to_tensor(inputs["top_k"], dtype="int64"),
        t_infer_seed,
        paddle.to_tensor(inputs["seq_lens_this_time"], dtype="int32"),
        paddle.to_tensor(inputs["cu_seq_lens_q_output"], dtype="int32"),
        token_num_output_cpu,
        increment_value,
    )

    ref_top_p, ref_top_k, ref_seed, ref_infer_seed = build_sampling_params_ref(
        inputs["top_p"],
        inputs["top_k"],
        inputs["infer_seed"],
        inputs["cu_seq_lens_q_output"],
        token_num_output_cpu,
        increment_value,
    )

    np.testing.assert_allclose(gpu_outs[0].numpy(), ref_top_p, rtol=1e-6, err_msg="Mismatch in top_p_padding")
    np.testing.assert_allclose(gpu_outs[1].numpy(), ref_top_k, err_msg="Mismatch in top_k_padding")
    np.testing.assert_allclose(gpu_outs[2].numpy(), ref_seed, err_msg="Mismatch in topp_seed")
    np.testing.assert_allclose(
        t_infer_seed.numpy(), ref_infer_seed, err_msg="Mismatch in infer_seed (in-place update)"
    )
|
|
|
|
|
|
class TestBuildSamplingParams(unittest.TestCase):
    """Unit tests for build_sampling_params custom operator."""

    def test_exact_golden_values(self):
        """Hand-computed expected outputs for a two-request batch.

        bid=0: decode, seq_lens_this_time=2  -> 2 output tokens
        bid=1: prefill, seq_lens_this_time=10 -> 1 output token
        """
        # Keep a handle on the seed tensor: the op updates it in place.
        t_infer_seed = paddle.to_tensor(np.array([100, 200], dtype=np.int64), dtype="int64")

        gpu_outs = build_sampling_params(
            paddle.to_tensor(np.array([0.9, 0.5], dtype=np.float32), dtype="float32"),
            paddle.to_tensor(np.array([50, 10], dtype=np.int64), dtype="int64"),
            t_infer_seed,
            paddle.to_tensor(np.array([2, 10], dtype=np.int32), dtype="int32"),
            paddle.to_tensor(np.array([0, 2, 3], dtype=np.int32), dtype="int32"),
            3,  # token_num_output_cpu = 2 + 1
            1,  # increment_value
        )

        np.testing.assert_allclose(gpu_outs[0].numpy().flatten(), [0.9, 0.9, 0.5], rtol=1e-6)
        np.testing.assert_allclose(gpu_outs[1].numpy().flatten(), [50, 50, 10])
        # topp_seed: bi=0 tid=0 => 100, bi=0 tid=1 => 104; bi=1 tid=0 => 200
        np.testing.assert_allclose(gpu_outs[2].numpy().flatten(), [100, 104, 200])
        # Each base seed advanced by increment_value=1.
        np.testing.assert_allclose(t_infer_seed.numpy(), [101, 201])

    def test_mixed_prefill_decode(self):
        """Mixed prefill/decode batch compared against the Python reference.

        bid=0: decode,  seq_lens_this_time=3   -> output=3
        bid=1: prefill, seq_lens_this_time=50  -> output=1
        bid=2: decode,  seq_lens_this_time=5   -> output=5
        bid=3: prefill, seq_lens_this_time=100 -> output=1
        bid=4: empty slot                      -> output=0
        """
        inputs = build_inputs(
            real_bsz=5,
            seq_lens_this_time_list=[3, 50, 5, 100, 0],
            seq_lens_encoder_list=[0, 50, 0, 100, 0],
            seed=300,
        )
        # 3 + 1 + 5 + 1 + 0 output tokens in total.
        self.assertEqual(inputs["token_num_output_cpu"], 10)
        run_and_compare(self, inputs, increment_value=5)

    def test_random_configs(self):
        """Randomized stress test over several batch configurations."""
        configs = (
            {"real_bsz": 8, "max_seq_len": 4, "increment_value": 1, "seed": 700},
            {"real_bsz": 32, "max_seq_len": 16, "increment_value": 16, "seed": 800},
        )
        for cfg in configs:
            with self.subTest(**cfg):
                rng = np.random.default_rng(cfg["seed"])
                this_time = rng.integers(0, cfg["max_seq_len"] + 1, size=cfg["real_bsz"]).tolist()
                # ~30% of non-empty slots become prefill requests; note the
                # short-circuit keeps rng.random() calls aligned with slots
                # where s > 0, preserving the draw sequence.
                encoder = [s if (s > 0 and rng.random() < 0.3) else 0 for s in this_time]

                inputs = build_inputs(
                    real_bsz=cfg["real_bsz"],
                    seq_lens_this_time_list=this_time,
                    seq_lens_encoder_list=encoder,
                    seed=cfg["seed"],
                )
                # Nothing to compare when every slot is empty.
                if inputs["token_num_output_cpu"] == 0:
                    continue
                run_and_compare(self, inputs, increment_value=cfg["increment_value"])
|
|
|
|
|
|
# Allow running this test module directly: `python this_file.py`.
if __name__ == "__main__":
    unittest.main()