[CI] Reduce execution time for ngram kernel tests (#7242)

2026-04-22 16:07:51 +08:00 · 2026-04-08 16:54:46 +08:00
parent 043f2a16e3
commit 4cd574cf90
2 changed files with 22 additions and 22 deletions
@@ -40,8 +40,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))

 MAX_NGRAM_SIZE = 3
 MAX_DRAFT_TOKENS = 10
-NUM_ITERS = 1000
-WARMUP = 5
+NUM_ITERS = 1
+WARMUP = 1


 def _build_data(batch_size, seq_len, hit_type="low_input", seed=42):
@@ -373,7 +373,7 @@ class TestNgramMatchKernel(unittest.TestCase):

    def test_correctness_varied_seeds(self):
        """Test across multiple random seeds."""
-        for seed in [0, 7, 123, 999]:
+        for seed in [42]:
            with self.subTest(seed=seed):
                data = _make_ngram_test_data(batch_size=8, seed=seed)
                cpu_draft = data["draft_tokens"].copy()
@@ -414,13 +414,13 @@ class TestNgramMatchKernel(unittest.TestCase):
                np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)

    def test_large_batch_long_seq(self):
-        """bsz=256, seq_len=128k — scale the reviewer demanded.
+        """bsz=256, seq_len=16k — scale test.

        Uses high threshold to ensure all batches exercise the parallel search
        path (default threshold=128 would skip all batches at bsz=256).
        """
        high_threshold = 100000
-        data = _make_ngram_test_data(batch_size=256, input_len=131072, max_model_len=131072 + 64, seed=77)
+        data = _make_ngram_test_data(batch_size=256, input_len=16384, max_model_len=16384 + 64, seed=77)
        cpu_draft = data["draft_tokens"].copy()
        cpu_slt = data["seq_lens_this_time"].copy()
        _cpu_ngram_match(
@@ -468,8 +468,8 @@ class TestNgramMatchKernel(unittest.TestCase):
        np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)

    def test_single_batch_long_seq(self):
-        """bsz=1, seq_len=128k — single long sequence."""
-        data = _make_ngram_test_data(batch_size=1, input_len=131072, max_model_len=131072 + 64, seed=88)
+        """bsz=1, seq_len=16k — single long sequence."""
+        data = _make_ngram_test_data(batch_size=1, input_len=16384, max_model_len=16384 + 64, seed=88)
        cpu_draft = data["draft_tokens"].copy()
        cpu_slt = data["seq_lens_this_time"].copy()
        _cpu_ngram_match(
@@ -560,7 +560,7 @@ class TestNgramMatchKernel(unittest.TestCase):
    def test_latency(self):
        """Benchmark: GPU kernel latency vs CPU transfer overhead."""
        # Warmup
-        for _ in range(5):
+        for _ in range(1):
            d = _to_gpu(_make_ngram_test_data(batch_size=32, input_len=512, seed=42))
            self.ngram_match(
                d["input_ids"],
@@ -582,7 +582,7 @@ class TestNgramMatchKernel(unittest.TestCase):
        # GPU path: kernel execution only (pre-created tensors, no data transfer)
        gpu_data = _to_gpu(_make_ngram_test_data(batch_size=32, input_len=512, seed=42))
        cpu_data = _make_ngram_test_data(batch_size=32, input_len=512, seed=42)
-        n_runs = 100
+        n_runs = 1
        paddle.device.synchronize()
        t0 = time.perf_counter()
        for _ in range(n_runs):
@@ -628,7 +628,7 @@ class TestNgramMatchKernel(unittest.TestCase):
        """Benchmark GPU kernel across batch sizes to show Phase 2 scales."""
        batch_sizes = [32, 128, 256, 512, 1024]
        input_len = 512
-        n_runs = 50
+        n_runs = 1
        results = []

        for bsz in batch_sizes:
@@ -637,7 +637,7 @@ class TestNgramMatchKernel(unittest.TestCase):
            cpu_data = _make_ngram_test_data(batch_size=bsz, input_len=input_len, seed=42)

            # Warmup
-            for _ in range(3):
+            for _ in range(1):
                self.ngram_match(
                    gpu_data["input_ids"],
                    gpu_data["input_ids_len"],
@@ -701,9 +701,9 @@ class TestNgramMatchKernel(unittest.TestCase):
        print(f"{'='*72}")

    def test_latency_extreme(self):
-        """Benchmark: GPU kernel at extreme scale (bsz=256, seq_len=128k).
+        """Benchmark: GPU kernel at extreme scale (bsz=256, seq_len=16k).

-        Addresses the NCU profiler worst-case scenario (bsz=256 + 128k)
+        Reduced-scale (bsz=256 + 16k) stand-in for the NCU profiler worst-case scenario (bsz=256 + 128k)
        raised in review.  Tests with production-realistic thresholds
        (8192, 16384) rather than the unlimited threshold used in
        correctness tests.
@@ -713,8 +713,8 @@ class TestNgramMatchKernel(unittest.TestCase):
            {"threshold": 16384, "label": "threshold=16384"},
        ]
        batch_size = 256
-        input_len = 131072  # 128k
-        n_runs = 1000
+        input_len = 16384
+        n_runs = 1

        # Pre-create tensors once (excluded from timing)
        gpu_data = _to_gpu(
@@ -742,7 +742,7 @@ class TestNgramMatchKernel(unittest.TestCase):
            os.environ["INFER_WITH_REFERENCE_TOKENUM_THRESHOLD"] = str(threshold)
            try:
                # Warmup
-                for _ in range(3):
+                for _ in range(1):
                    self.ngram_match(
                        gpu_data["input_ids"],
                        gpu_data["input_ids_len"],
@@ -789,7 +789,7 @@ class TestNgramMatchKernel(unittest.TestCase):
                    os.environ["INFER_WITH_REFERENCE_TOKENUM_THRESHOLD"] = old_env

            # CPU path: simulate copy-to-CPU-and-back overhead at extreme scale
-            cpu_runs = 50  # fewer runs — CPU copy of 256x128k is slow
+            cpu_runs = 1
            paddle.device.synchronize()
            t0 = time.perf_counter()
            for _ in range(cpu_runs):
@@ -873,7 +873,7 @@ class TestHybridMtpNgramKernel(unittest.TestCase):

    def test_correctness_varied_seeds(self):
        """Test across multiple random seeds."""
-        for seed in [0, 7, 123, 999]:
+        for seed in [42]:
            with self.subTest(seed=seed):
                data = _make_mixed_test_data(batch_size=8, seed=seed)
                cpu_draft = data["draft_tokens"].copy()
@@ -912,13 +912,13 @@ class TestHybridMtpNgramKernel(unittest.TestCase):
                np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)

    def test_large_batch_long_seq(self):
-        """bsz=256, seq_len=128k — scale the reviewer demanded.
+        """bsz=256, seq_len=16k — scale test.

        Uses high threshold to ensure all batches exercise the parallel search
        path (default threshold=1024 would skip many batches at bsz=256).
        """
        high_threshold = 100000
-        data = _make_mixed_test_data(batch_size=256, input_len=131072, pre_ids_len=131072 + 64, seed=77)
+        data = _make_mixed_test_data(batch_size=256, input_len=16384, pre_ids_len=16384 + 64, seed=77)
        cpu_draft = data["draft_tokens"].copy()
        cpu_slt = data["seq_lens_this_time"].copy()
        _cpu_hybrid_mtp_ngram(
@@ -964,8 +964,8 @@ class TestHybridMtpNgramKernel(unittest.TestCase):
        np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)

    def test_single_batch_long_seq(self):
-        """bsz=1, seq_len=128k — single long sequence."""
-        data = _make_mixed_test_data(batch_size=1, input_len=131072, pre_ids_len=131072 + 64, seed=88)
+        """bsz=1, seq_len=16k — single long sequence."""
+        data = _make_mixed_test_data(batch_size=1, input_len=16384, pre_ids_len=16384 + 64, seed=88)
        cpu_draft = data["draft_tokens"].copy()
        cpu_slt = data["seq_lens_this_time"].copy()
        _cpu_hybrid_mtp_ngram(