[CI] Reduce execution time for ngram kernel tests (#7242)

This commit is contained in:
YuBaoku
2026-04-08 16:54:46 +08:00
committed by GitHub
parent 043f2a16e3
commit 4cd574cf90
2 changed files with 22 additions and 22 deletions
@@ -40,8 +40,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))
MAX_NGRAM_SIZE = 3
MAX_DRAFT_TOKENS = 10
NUM_ITERS = 1000
WARMUP = 5
NUM_ITERS = 1
WARMUP = 1
def _build_data(batch_size, seq_len, hit_type="low_input", seed=42):
+20 -20
View File
@@ -373,7 +373,7 @@ class TestNgramMatchKernel(unittest.TestCase):
def test_correctness_varied_seeds(self):
"""Test across multiple random seeds."""
for seed in [0, 7, 123, 999]:
for seed in [42]:
with self.subTest(seed=seed):
data = _make_ngram_test_data(batch_size=8, seed=seed)
cpu_draft = data["draft_tokens"].copy()
@@ -414,13 +414,13 @@ class TestNgramMatchKernel(unittest.TestCase):
np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)
def test_large_batch_long_seq(self):
"""bsz=256, seq_len=128k — scale the reviewer demanded.
"""bsz=256, seq_len=16k — scale test.
Uses high threshold to ensure all batches exercise the parallel search
path (default threshold=128 would skip all batches at bsz=256).
"""
high_threshold = 100000
data = _make_ngram_test_data(batch_size=256, input_len=131072, max_model_len=131072 + 64, seed=77)
data = _make_ngram_test_data(batch_size=256, input_len=16384, max_model_len=16384 + 64, seed=77)
cpu_draft = data["draft_tokens"].copy()
cpu_slt = data["seq_lens_this_time"].copy()
_cpu_ngram_match(
@@ -468,8 +468,8 @@ class TestNgramMatchKernel(unittest.TestCase):
np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)
def test_single_batch_long_seq(self):
"""bsz=1, seq_len=128k — single long sequence."""
data = _make_ngram_test_data(batch_size=1, input_len=131072, max_model_len=131072 + 64, seed=88)
"""bsz=1, seq_len=16k — single long sequence."""
data = _make_ngram_test_data(batch_size=1, input_len=16384, max_model_len=16384 + 64, seed=88)
cpu_draft = data["draft_tokens"].copy()
cpu_slt = data["seq_lens_this_time"].copy()
_cpu_ngram_match(
@@ -560,7 +560,7 @@ class TestNgramMatchKernel(unittest.TestCase):
def test_latency(self):
"""Benchmark: GPU kernel latency vs CPU transfer overhead."""
# Warmup
for _ in range(5):
for _ in range(1):
d = _to_gpu(_make_ngram_test_data(batch_size=32, input_len=512, seed=42))
self.ngram_match(
d["input_ids"],
@@ -582,7 +582,7 @@ class TestNgramMatchKernel(unittest.TestCase):
# GPU path: kernel execution only (pre-created tensors, no data transfer)
gpu_data = _to_gpu(_make_ngram_test_data(batch_size=32, input_len=512, seed=42))
cpu_data = _make_ngram_test_data(batch_size=32, input_len=512, seed=42)
n_runs = 100
n_runs = 1
paddle.device.synchronize()
t0 = time.perf_counter()
for _ in range(n_runs):
@@ -628,7 +628,7 @@ class TestNgramMatchKernel(unittest.TestCase):
"""Benchmark GPU kernel across batch sizes to show Phase 2 scales."""
batch_sizes = [32, 128, 256, 512, 1024]
input_len = 512
n_runs = 50
n_runs = 1
results = []
for bsz in batch_sizes:
@@ -637,7 +637,7 @@ class TestNgramMatchKernel(unittest.TestCase):
cpu_data = _make_ngram_test_data(batch_size=bsz, input_len=input_len, seed=42)
# Warmup
for _ in range(3):
for _ in range(1):
self.ngram_match(
gpu_data["input_ids"],
gpu_data["input_ids_len"],
@@ -701,9 +701,9 @@ class TestNgramMatchKernel(unittest.TestCase):
print(f"{'='*72}")
def test_latency_extreme(self):
"""Benchmark: GPU kernel at extreme scale (bsz=256, seq_len=128k).
"""Benchmark: GPU kernel at extreme scale (bsz=256, seq_len=16k).
Addresses the NCU profiler worst-case scenario (bsz=256 + 128k)
Addresses the NCU profiler worst-case scenario (bsz=256 + 16k)
raised in review. Tests with production-realistic thresholds
(8192, 16384) rather than the unlimited threshold used in
correctness tests.
@@ -713,8 +713,8 @@ class TestNgramMatchKernel(unittest.TestCase):
{"threshold": 16384, "label": "threshold=16384"},
]
batch_size = 256
input_len = 131072 # 128k
n_runs = 1000
input_len = 16384
n_runs = 1
# Pre-create tensors once (excluded from timing)
gpu_data = _to_gpu(
@@ -742,7 +742,7 @@ class TestNgramMatchKernel(unittest.TestCase):
os.environ["INFER_WITH_REFERENCE_TOKENUM_THRESHOLD"] = str(threshold)
try:
# Warmup
for _ in range(3):
for _ in range(1):
self.ngram_match(
gpu_data["input_ids"],
gpu_data["input_ids_len"],
@@ -789,7 +789,7 @@ class TestNgramMatchKernel(unittest.TestCase):
os.environ["INFER_WITH_REFERENCE_TOKENUM_THRESHOLD"] = old_env
# CPU path: simulate copy-to-CPU-and-back overhead at extreme scale
cpu_runs = 50 # fewer runs — CPU copy of 256x128k is slow
cpu_runs = 1
paddle.device.synchronize()
t0 = time.perf_counter()
for _ in range(cpu_runs):
@@ -873,7 +873,7 @@ class TestHybridMtpNgramKernel(unittest.TestCase):
def test_correctness_varied_seeds(self):
"""Test across multiple random seeds."""
for seed in [0, 7, 123, 999]:
for seed in [42]:
with self.subTest(seed=seed):
data = _make_mixed_test_data(batch_size=8, seed=seed)
cpu_draft = data["draft_tokens"].copy()
@@ -912,13 +912,13 @@ class TestHybridMtpNgramKernel(unittest.TestCase):
np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)
def test_large_batch_long_seq(self):
"""bsz=256, seq_len=128k — scale the reviewer demanded.
"""bsz=256, seq_len=16k — scale test.
Uses high threshold to ensure all batches exercise the parallel search
path (default threshold=1024 would skip many batches at bsz=256).
"""
high_threshold = 100000
data = _make_mixed_test_data(batch_size=256, input_len=131072, pre_ids_len=131072 + 64, seed=77)
data = _make_mixed_test_data(batch_size=256, input_len=16384, pre_ids_len=16384 + 64, seed=77)
cpu_draft = data["draft_tokens"].copy()
cpu_slt = data["seq_lens_this_time"].copy()
_cpu_hybrid_mtp_ngram(
@@ -964,8 +964,8 @@ class TestHybridMtpNgramKernel(unittest.TestCase):
np.testing.assert_array_equal(gpu_data["draft_tokens"].numpy(), cpu_draft)
def test_single_batch_long_seq(self):
"""bsz=1, seq_len=128k — single long sequence."""
data = _make_mixed_test_data(batch_size=1, input_len=131072, pre_ids_len=131072 + 64, seed=88)
"""bsz=1, seq_len=16k — single long sequence."""
data = _make_mixed_test_data(batch_size=1, input_len=16384, pre_ids_len=16384 + 64, seed=88)
cpu_draft = data["draft_tokens"].copy()
cpu_slt = data["seq_lens_this_time"].copy()
_cpu_hybrid_mtp_ngram(