Files
FastDeploy/tests/metax_ci/test_ernie_21b.py
T
2026-02-09 16:25:23 +08:00

62 lines
1.9 KiB
Python

import os
import unittest
import fastdeploy
# Process-wide environment for this test module, applied in a single pass.
# Every value is a string because it is stored in os.environ.
# NOTE(review): these are assigned after `import fastdeploy` has already run;
# presumably the package reads them lazily at engine start-up -- confirm.
os.environ.update(
    {
        "MACA_VISIBLE_DEVICES": "0,1",
        "FD_MOE_BACKEND": "cutlass",
        "PADDLE_XCCL_BACKEND": "metax_gpu",
        "FLAGS_weight_only_linear_arch": "80",
        "FD_METAX_KVCACHE_MEM": "8",
        "ENABLE_V1_KVCACHE_SCHEDULER": "1",
        "FD_ENC_DEC_BLOCK_NUM": "2",
        "FD_SAMPLING_CLASS": "rejection",
    }
)

# Path handed to the engine as its `model` argument.
MODEL_PATH = "/data/models/PaddlePaddle/ERNIE-4.5-21B-A3B-Thinking"
class TestErnie21B(unittest.TestCase):
    """End-to-end check that sends one text prompt through ``fastdeploy.LLM``.

    The engine and sampling parameters are created once in ``setUpClass`` and
    stored on the class, so every test method reuses the same instance.
    """

    @classmethod
    def setUpClass(cls):
        """Class-level setup that runs once before all tests.

        Fills in the prompt/keyword fixtures via ``set_config`` and then
        builds the shared ``cls.llm`` engine from ``MODEL_PATH`` together
        with the ``cls.sampling_params`` used for generation.
        """
        cls.set_config()
        cls.llm = fastdeploy.LLM(
            model=MODEL_PATH,
            tensor_parallel_size=2,
            engine_worker_queue_port=8899,
            max_model_len=256,
            quantization="wint8",
            load_choices="default_v1",
            # Options below are intentionally left disabled.
            # NOTE(review): presumably kept as debugging knobs -- confirm.
            # enable_prefix_caching=False,
            disable_custom_all_reduce=True,
            # graph_optimization_config={"use_cudagraph": False, "graph_opt_level": 0},
        )
        cls.sampling_params = fastdeploy.SamplingParams(top_p=0.95, max_tokens=256, temperature=0.6)

    @classmethod
    def set_config(cls):
        """Set the configuration parameters for the test.

        Defines ``cls.text_prompt`` (the list of prompts sent to the engine)
        and ``cls.text_answer_keyword`` (substrings that must all appear in
        the generated text for the test to pass).
        """
        cls.text_prompt = [
            "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
        ]
        cls.text_answer_keyword = ["fiber", "2 + 1 = 3"]

    def test_text(self):
        """Generate text for the prompt and require every expected keyword in it."""
        outputs = self.llm.generate(self.text_prompt, self.sampling_params)
        generated_text = outputs[0].outputs.text
        print(f"Generated: {generated_text!r}")
        # unittest assertions instead of a bare `assert`: they are not stripped
        # under `python -O`, and a failure reports which keyword was missing.
        for keyword in self.text_answer_keyword:
            self.assertIn(keyword, generated_text)
# Allow running this file directly as a script; unittest discovers and runs
# the test cases defined in this module.
if __name__ == "__main__":
    unittest.main()