FastDeploy/tests/utils/test_config.py
cloudforge1 85c6773e6c [CI]【Hackathon 10th Spring No.33】Add config unit tests (#6730)
* [CI]【Hackathon 10th Spring No.33】Add config unit tests

* fix test_commit_config: reset fields before partial-file test

* [CI]【Hackathon 10th Spring No.33】boost delta coverage for architecture helper branches

* [CI]【Hackathon 10th Spring No.33】add version attr to model config mock

* [CI]【Hackathon 10th Spring No.33】add mrope, runner validation, tail_layer coverage

* [CI]【Hackathon 10th Spring No.33】boost: cover 96 more lines (FDConfig assertions, guided decoding, env branches)

* [CI]【Hackathon 10th Spring No.33】config unit test

* [CI]【Hackathon 10th Spring No.33】cover expert parallel branch

* fix: reset commit hash before _load_from_version_file test; block cuda import via setitem(None)

* refactor: convert to unittest.TestCase style per reviewer request

---------

Co-authored-by: cloudforge1 <cloudforge1@users.noreply.github.com>
Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
Co-authored-by: Tao Luo <luotao02@baidu.com>
2026-04-09 14:28:54 +08:00


# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
import tempfile
import types
import unittest
from pathlib import Path
from types import SimpleNamespace

import paddle
import pytest
import yaml

from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
CommitConfig,
DeviceConfig,
EarlyStopConfig,
EPLBConfig,
ErnieArchitectures,
FDConfig,
GraphOptimizationConfig,
LoadConfig,
ModelConfig,
MoEPhase,
ParallelConfig,
RoutingReplayConfig,
SchedulerConfig,
SpeculativeConfig,
StructuredOutputsConfig,
iter_architecture_defaults,
try_match_architecture_defaults,
)
# fmt: off
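# _BP: baseline Llama-style pretrained-config fields, reused as the default config.json payload
# in the ModelConfig fixtures below.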
_BP = {"architectures": ["LlamaForCausalLM"], "hidden_size": 4096, "num_attention_heads": 32,
"num_key_value_heads": 8, "head_dim": 128, "num_hidden_layers": 32, "vocab_size": 32000,
"intermediate_size": 11008}
_EP = {"tensor_parallel_size": 4, "enable_expert_parallel": True, "data_parallel_size": 1}
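# Fake current_platform whose is_cuda/is_xpu/is_intel_hpu flags vary per test (maca/iluvatar always False).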
def _plat(cuda=False, xpu=False, hpu=False): # noqa: E302
return SimpleNamespace(is_xpu=lambda: xpu, is_cuda=lambda: cuda, is_maca=lambda: False,
is_iluvatar=lambda: False, is_intel_hpu=lambda: hpu)
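# Fake model registry: the is_text_generation/pooling/multimodal/reasoning predicates return the
# given flags, get_supported_archs includes arch, and inspect_model_cls reports dpt as the
# default pooling type.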
def _fr(gen=True, pool=False, mm=False, reason=False, arch="LlamaForCausalLM", dpt=None): # noqa: E302
info = SimpleNamespace(default_pooling_type=dpt)
return SimpleNamespace(
is_text_generation_model=lambda a, m: gen, is_pooling_model=lambda a, m: pool,
is_multimodal_model=lambda a, m: mm, is_reasoning_model=lambda a, m: reason,
get_supported_archs=lambda: {"LlamaForCausalLM", arch}, inspect_model_cls=lambda a, m: (info, arch),
)
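# Minimal model_cfg namespace exposing the head/layer/quantization fields CacheConfig reads.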
def _mcfg(**ov): # noqa: E302
d = dict(num_key_value_heads=8, num_attention_heads=32, head_dim=128,
num_hidden_layers=24, quantization=None, quantization_config=None)
d.update(ov); return SimpleNamespace(**d) # noqa: E702
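# Lightweight ModelConfig stand-in carrying only the attributes FDConfig touches
# (max_model_len, architectures, moe_phase, model_format, ...).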
def _fdm(**ov): # noqa: E302
d = dict(max_model_len=512, architectures=["test_model"], mm_max_tokens_per_item=None,
enable_mm=False, model_format="paddle", moe_phase=MoEPhase(),
first_k_dense_replace=0, version="init")
d.update(ov); return SimpleNamespace(**d) # noqa: E702
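# Multimodal variant of the ModelConfig stand-in, with per-modality mm_max_tokens_per_item.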
def _mm(): # noqa: E302
return _fdm(enable_mm=True, mm_max_tokens_per_item={"image": 256, "video": 0, "audio": 0})
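# Build a real ModelConfig against the temp dir tp: writes config.json (cj, or pre/_BP plus a dtype),
# patches PretrainedConfig, check_unified_ckpt, get_pooling_config and the registry through the
# MonkeyPatch mp, then constructs ModelConfig from {"model": str(tp), **args}.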
def _mmc(mp, tp, *, pre=None, cj=None, args=None, reg=None, pc=None, arch=None): # noqa: E302
if arch and pre is None: pre = {**_BP, "architectures": [arch]} # noqa: E701
pc_ = dict(pre) if pre is not None else dict(_BP)
raw = dict(cj) if cj is not None else {**pc_, "dtype": "bfloat16"}
(tp / "config.json").write_text(json.dumps(raw))
_fpc = {"get_config_dict": staticmethod(lambda model, **kw: (dict(pc_), None)),
"from_dict": staticmethod(lambda data, **kw: SimpleNamespace(**data))}
mp.setattr("fastdeploy.config.PretrainedConfig", type("FPC", (), _fpc))
mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
mp.setattr("fastdeploy.config.get_pooling_config", lambda m, revision=None: pc)
mp.setattr(ModelConfig, "registry", property(lambda self: reg or _fr()))
a = {"model": str(tp)}
if args: a.update(args) # noqa: E701
return ModelConfig(a)
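# Build an FDConfig in test mode with stub sub-configs; the "parallel", "cache" and "scheduler"
# dicts feed the matching *Config args, model_config defaults to _fdm(), and remaining kwargs
# are passed straight to FDConfig.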
def _mfd(mp, **ov): # noqa: E302
mp.setattr("fastdeploy.config.get_host_ip", lambda: "127.0.0.1")
kw = dict(parallel_config=ParallelConfig(ov.pop("parallel", {})),
graph_opt_config=GraphOptimizationConfig({}),
cache_config=CacheConfig(ov.pop("cache", {})), load_config=LoadConfig({}),
scheduler_config=SchedulerConfig(ov.pop("scheduler", {})),
model_config=ov.pop("model_config", _fdm()), test_mode=True)
kw.update(ov); return FDConfig(**kw) # noqa: E702
# fmt: on
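# Covers the architecture helpers, ModelConfig construction/validation, and the smaller
# sub-configs (MoEPhase, CacheConfig, SpeculativeConfig, ParallelConfig, ...).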
class TestConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def test_architecture_ernie(self):
assert len(list(iter_architecture_defaults())) > 5
assert try_match_architecture_defaults("LlamaForCausalLM") == ("ForCausalLM", ("generate", "none"))
assert ErnieArchitectures.contains_ernie_arch(["Ernie4_5ForCausalLM"])
assert ErnieArchitectures.is_ernie_arch("Ernie4_5_MoeForCausalLM")
assert ErnieArchitectures.is_ernie5_arch(["Ernie5ForCausalLM"])
fake = type("_E", (), {"name": staticmethod(lambda: "ErnieTestForCausalLM")})
ErnieArchitectures.register_ernie_model_arch(fake)
try:
assert ErnieArchitectures.is_ernie_arch("ErnieTestForCausalLM")
finally:
ErnieArchitectures.ARCHITECTURES.discard("ErnieTestForCausalLM")
assert not ErnieArchitectures.contains_ernie_arch(["LlamaForCausalLM"])
assert not ErnieArchitectures.is_ernie_arch("ErnieUnknownForCausalLM")
assert not ErnieArchitectures.is_ernie5_arch(["LlamaForCausalLM"])
phase = MoEPhase()
phase.phase = "decode"
with self.assertRaises(ValueError):
phase.phase = "invalid"
assert DeviceConfig({"device_type": "xpu"}).device_type == "xpu"
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="generate") is not None
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="pooling") is None
assert try_match_architecture_defaults("ToyRewardModel", convert_type="reward") is not None
assert try_match_architecture_defaults("ToyForImageClassification", convert_type="reward") is None
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar", "reasoning_parser": "test"})
assert so.guided_decoding_backend == "xgrammar" and "xgrammar" in str(so)
rr = RoutingReplayConfig({"enable_routing_replay": True, "routing_store_type": "rdma"})
assert rr.enable_routing_replay is True and "rdma" in rr.to_json_string()
assert RoutingReplayConfig(None).enable_routing_replay is False
def test_graph_cache_spec_parallel(self):
g = GraphOptimizationConfig({})
assert isinstance(g.use_cudagraph, bool)
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
g.cudagraph_capture_sizes_prefill = [8, 4, 2, 1]
g.init_with_cudagrpah_size(max_capture_size=128, max_capture_shape_prefill=8)
g.filter_capture_size(tp_size=2)
assert all(s % 2 == 0 for s in g.cudagraph_capture_sizes)
assert CacheConfig.get_cache_bytes("bf16") == 2
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16", "num_gpu_blocks_override": 100})
c.max_block_num_per_seq = 8
c.postprocess(num_total_tokens=1024, number_of_tasks=2)
assert c.total_block_num == 100
r = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
r.max_block_num_per_seq, r.enc_dec_block_num = 4, 0
r.reset(num_gpu_blocks=200)
assert r.total_block_num == 200
es = EarlyStopConfig({"enable_early_stop": True, "threshold": 0.5})
es.enable_early_stop = None
es.update_enable_early_stop(True)
assert es.enable_early_stop is True
sp = SpeculativeConfig({"method": "mtp"})
sp.num_model_steps, sp.num_speculative_tokens = 3, 1
sp.check_legality_parameters()
assert sp.num_speculative_tokens == 3
self.mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
(self.tp / "config.json").write_text(json.dumps({"num_hidden_layers": 32}))
fsp = SpeculativeConfig({"method": "mtp", "model": str(self.tp)})
assert fsp.model_config == {"num_hidden_layers": 32}
self.mp.setenv("FLAGS_use_pd_disaggregation", "1")
assert ParallelConfig({}).pd_disaggregation_mode == "per_query"
gid, grp = [], []
self.mp.setattr("fastdeploy.config.dist.collective._set_custom_gid", gid.append)
self.mp.setattr("fastdeploy.config.dist.new_group", lambda r: (grp.append(list(r)), tuple(r))[1])
# fmt: off
p = ParallelConfig({"data_parallel_rank": 1, "data_parallel_size": 2,
"tensor_parallel_size": 4, "enable_expert_parallel": True}) # noqa: E127
# fmt: on
p.set_communicate_group()
assert gid == [1 + envs.FD_TP_GROUP_GID_OFFSET, None, 2 + envs.FD_TP_GROUP_GID_OFFSET, None]
assert grp == [[4, 5, 6, 7], list(range(8))]
assert p.tp_group == (4, 5, 6, 7) and p.ep_group == tuple(range(8))
def test_modelconfig_defaults_validation(self):
self.mp.setenv("COMPRESSION_RATIO", "1.25")
pre = {**_BP, "infer_model_mp_num": 2, "remove_tail_layer": 3, "n_routed_experts": 16}
cfg = _mmc(self.mp, self.tp, pre=pre)
assert cfg.runner_type == "generate" and cfg.num_hidden_layers == 29
assert cfg.tensor_parallel_size == 2 and cfg.moe_num_experts == 16
assert cfg.compression_ratio == 1.25
# fmt: off
pool_pre = {**_BP, "text_config": {"custom_text_attr": 99},
"vision_config": {"image_size": 224, "patch_size": 14}}
pcfg = _mmc(self.mp, self.tp, pre=pool_pre, args={"runner": "pooling", "convert": "auto"},
reg=_fr(gen=False, pool=True), pc={"normalize": True})
# fmt: on
assert pcfg.runner_type == "pooling" and pcfg.custom_text_attr == 99
assert pcfg.vision_config.image_size == 224 and "encode" in pcfg.supported_tasks
with self.assertRaisesRegex(ValueError, "less than -1"):
_mmc(self.mp, self.tp, args={"max_logprobs": -2})
with self.assertRaisesRegex(ValueError, "greater than the vocabulary"):
_mmc(self.mp, self.tp, args={"max_logprobs": 99999})
with self.assertRaisesRegex(ValueError, "does not support.*generate"):
_mmc(self.mp, self.tp, args={"runner": "generate", "model_impl": "fastdeploy"}, reg=_fr(gen=False))
with self.assertRaisesRegex(ValueError, "does not support.*pooling"):
_mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "none"}, reg=_fr(gen=False))
def test_modelconfig_mrope_format(self):
mrp = {**_BP, "mrope_section": [16, 24, 24], "rope_scaling": {"type": "mrope", "factor": 1.0}}
cfg = _mmc(self.mp, self.tp, pre=mrp)
assert cfg.rope_3d and cfg.rope_scaling["mrope_section"] == [16, 24, 24] and cfg.freq_allocation == 16
cfg2 = _mmc(self.mp, self.tp, pre={**_BP, "mrope_section": [8, 12, 12]})
assert cfg2.rope_3d and cfg2.rope_scaling == {"mrope_section": [8, 12, 12]}
assert _mmc(self.mp, self.tp, pre={**_BP, "remove_tail_layer": True}).num_hidden_layers == 31
for cj, exp in [
({**_BP, "torch_dtype": "bfloat16"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.57.0"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.55.0"}, "paddle"),
]:
assert _mmc(self.mp, self.tp, cj=cj).model_format == exp
with self.assertRaisesRegex(ValueError, "Only one of"):
_mmc(self.mp, self.tp, cj={**_BP, "torch_dtype": "bf16", "dtype": "bf16"})
mxfp4 = {**_BP, "quantization_config": {"quant_method": "mxfp4"}}
assert _mmc(self.mp, self.tp, cj=mxfp4).model_format == "torch"
with self.assertRaisesRegex(ValueError, "Unknown model format"):
_mmc(self.mp, self.tp, cj={**_BP})
ecfg = _mmc(self.mp, self.tp, pre={**_BP, "n_shared_experts": 4, "moe_num_shared_experts": None})
assert ecfg.moe_num_shared_experts == 4
(self.tp / "version.yaml").write_text(yaml.dump({"version": "2.0"}))
ecfg.read_model_version()
assert ecfg.version == "2.0"
def test_modelconfig_pooling_tasks(self):
cfg = _mmc(self.mp, self.tp, arch="MysteryArch", reg=_fr(gen=False, arch="OtherArch"))
assert cfg._get_default_runner_type(["MysteryArch"]) == "generate"
assert cfg._get_default_convert_type(["MysteryArch"], "generate") == "none"
_te_reg = _fr(gen=False, pool=True, arch="OtherArch", dpt="CLS")
# fmt: off
pcfg = _mmc(self.mp, self.tp, arch="ToyEmbeddingModel",
args={"runner": "pooling", "convert": "auto"}, reg=_te_reg)
# fmt: on
assert pcfg._get_default_pooling_task(["ToyEmbeddingModel"]) == "embed"
assert pcfg.supported_tasks == ["encode", "embed"]
with self.assertRaisesRegex(TypeError, "PoolerConfig"):
_pa = {"runner": "pooling", "convert": "auto", "override_pooler_config": {"normalize": True}}
_mmc(self.mp, self.tp, arch="ToyEmbeddingModel", args=_pa, reg=_te_reg)
cfg2 = _mmc(self.mp, self.tp)
with self.assertRaises(AssertionError):
cfg2._get_supported_tasks(["LlamaForCausalLM"], "invalid", "none")
assert cfg2._get_download_model("demo") is None
# fmt: off
acfg = _mmc(self.mp, self.tp, args={"runner": "auto", "convert": "auto"},
reg=_fr(gen=False, pool=True, dpt="CLS"))
# fmt: on
assert acfg.runner_type == "pooling" and acfg.convert_type == "none"
assert acfg.pooler_config is not None and acfg.pooler_config.pooling_type == "CLS"
assert "encode" in acfg.supported_tasks
ecfg = _mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "auto"}, reg=_fr(gen=False))
assert ecfg.convert_type == "embed"
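# Covers FDConfig assembly: node topology and port splitting, platform/env branches,
# cudagraph capture sizes, guided decoding backends, and the check() assertions.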
class TestFDConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def _cuda(self):
self.mp.setattr("fastdeploy.config.current_platform", _plat(cuda=True))
def test_topology_env(self):
# fmt: off
multi = _mfd(self.mp, ips=["127.0.0.1", "0.0.0.0"],
parallel={"tensor_parallel_size": 16, "expert_parallel_size": 1})
# fmt: on
assert multi.nnode == 2 and multi.is_master is True
# fmt: off
_par = {"engine_worker_queue_port": "8010,8011,8012,8013", "data_parallel_size": 4,
"tensor_parallel_size": 2, "local_data_parallel_id": 2}
_cch = {"cache_queue_port": "8110,8111,8112,8113", "pd_comm_port": "8210,8211,8212,8213",
"rdma_comm_ports": "8310,8311,8320,8321,8330,8331,8340,8341"}
# fmt: on
ported = _mfd(self.mp, ips="0.0.0.0", parallel=_par, cache=_cch)
cc = ported.cache_config
assert ported.parallel_config.local_engine_worker_queue_port == 8012
assert cc.local_cache_queue_port == 8112 and cc.local_pd_comm_port == 8212
assert cc.local_rdma_comm_ports == [8330, 8331]
glm = _mfd(self.mp, model_config=_fdm(architectures=["Glm4MoeForCausalLM"], first_k_dense_replace=2))
assert glm.model_config.moe_layer_start_index == 2
dec = _mfd(self.mp, scheduler={"splitwise_role": "decode", "max_num_seqs": 34, "max_num_batched_tokens": 2048})
assert dec.get_max_chunk_tokens() == 34
dec.test_attr = "1,2,3"
dec._str_to_list("test_attr", int)
assert dec.test_attr == [1, 2, 3]
dec.test_attr2 = None
dec._str_to_list("test_attr2", int)
assert dec.test_attr2 is None
fd = _mfd(self.mp, ips=["10.0.0.1", "127.0.0.1"], parallel={"tensor_parallel_size": 16})
assert fd.is_master is False and fd.master_ip == "10.0.0.1"
# fmt: off
fd_v1 = _mfd(self.mp, scheduler={"name": "local", "splitwise_role": "prefill"},
router_config=SimpleNamespace(router="http://r", api_server_port=8080, metrics_port=9090))
# fmt: on
assert fd_v1.splitwise_version == "v1"
# fmt: off
reg = _mfd(self.mp, cache={"cache_transfer_protocol": "rdma,ipc", "pd_comm_port": "2334"},
scheduler={"splitwise_role": "prefill"})
# fmt: on
assert reg.register_info is not None
pf = _mfd(self.mp, ips="0.0.0.0", scheduler={"splitwise_role": "prefill"})
assert pf.model_config.moe_phase.phase == "prefill"
self.mp.setenv("FD_FOR_TORCH_MODEL_FORMAT", "1")
assert _mfd(self.mp).model_config.model_format == "torch"
self.mp.delenv("FD_FOR_TORCH_MODEL_FORMAT", raising=False)
self.mp.setenv("FD_ENABLE_MAX_PREFILL", "1")
assert _mfd(self.mp, scheduler={"max_num_seqs": 42}).max_prefill_batch == 42
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), cache={"enable_chunked_prefill": True})
assert fd2.scheduler_config.max_num_batched_tokens == 2048
def test_mm_dynload_subconfig(self):
assert _mfd(self.mp, model_config=_mm()).cache_config.max_encoder_cache == 0
e5 = _mfd(self.mp, model_config=_fdm(architectures=["Ernie5ForCausalLM"]))
assert getattr(e5.cache_config, "disable_chunked_mm_input", False) is True
dyn = _mfd(self.mp, load_config=LoadConfig({"dynamic_load_weight": True}))
assert dyn.graph_opt_config.graph_opt_level == 0
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
spf = _mfd(self.mp, speculative_config=sp, scheduler={"splitwise_role": "prefill"})
assert spf.speculative_config.num_speculative_tokens == 1 and spf.speculative_config.num_model_steps == 1
model = _fdm()
model.read_model_version = lambda: setattr(model, "version", "tv")
_rc = SimpleNamespace(router="http://127.0.0.1:8000", api_server_port=8000, metrics_port=8000)
# fmt: off
fd = _mfd(self.mp, model_config=model,
load_config=LoadConfig({"dynamic_load_weight": True}), router_config=_rc)
# fmt: on
assert fd.model_config.version == "tv"
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"gpu_memory_utilization": 1.5, "model_cfg": _mcfg()})
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"kv_cache_ratio": 1.5, "model_cfg": _mcfg()})
sp2 = SpeculativeConfig({"method": "mtp"})
sp2.print()
with self.assertRaisesRegex(ValueError, "max_ngram_size >= min_ngram_size"):
SpeculativeConfig({"method": "ngram", "max_ngram_size": 1, "min_ngram_size": 5})
sp2._apply_user_args(None)
self.mp.setenv("SPECULATE_VERIFY_USE_TOPK", "1")
assert SpeculativeConfig({"method": "mtp"}).verify_strategy.value == 1
assert SpeculativeConfig({"method": "naive", "num_speculative_tokens": 5}).num_speculative_tokens == 0
ep = EPLBConfig(None)
assert ep.enable_eplb is False
ep.print()
es = EarlyStopConfig({"enable_early_stop": False})
with self.assertRaisesRegex(ValueError, "Cannot set"):
es.update_enable_early_stop(True)
cc = CommitConfig()
cc.fastdeploy_commit = ""
cc._load_from_version_file(str(self.tp / "nonexistent.txt"))
assert cc.fastdeploy_commit == ""
bad = self.tp / "bad_version.txt"
bad.write_bytes(b"\xff\xfe" + bytes(range(128, 256)))
cc._load_from_version_file(str(bad))
cc.print()
def test_v0_platforms(self):
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
c.max_block_num_per_seq, c.enc_dec_block_num = 4, 0
c.reset(num_gpu_blocks=200)
assert c.total_block_num == 200 and c.prefill_kvcache_block_num == int(200 * c.kv_cache_ratio)
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
self._cuda()
assert _mfd(self.mp, model_config=_mm()).max_prefill_batch == 1
# fmt: off
fd = _mfd(self.mp, model_config=_fdm(max_model_len=4096),
scheduler={"max_num_batched_tokens": None, "enable_chunked_prefill": True},
cache={"enable_chunked_prefill": True})
# fmt: on
assert fd.scheduler_config.max_num_batched_tokens == 2048
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), scheduler={"max_num_batched_tokens": None})
assert fd2.scheduler_config.max_num_batched_tokens == 4096
fd3 = _mfd(self.mp, model_config=_mm(), cache={"enable_prefix_caching": True})
assert fd3.cache_config.enable_prefix_caching is False
self.mp.setattr("fastdeploy.config.current_platform", _plat(xpu=True))
self.mp.setenv("XPU_VISIBLE_DEVICES", "0,1")
assert _mfd(self.mp).parallel_config.device_ids == "0,1"
self.mp.setattr("fastdeploy.config.current_platform", _plat(hpu=True))
self.mp.setenv("HPU_VISIBLE_DEVICES", "2,3")
assert _mfd(self.mp).parallel_config.device_ids == "2,3"
def test_cudagraph_mm_seq(self):
self._cuda()
fd1 = _mfd(self.mp, parallel=_EP, scheduler={"max_num_seqs": 2})
assert fd1.parallel_config.use_sequence_parallel_moe is False
_dec_sch = {"splitwise_role": "decode", "max_num_seqs": 2, "max_num_batched_tokens": 4096}
fd2 = _mfd(self.mp, parallel=_EP, scheduler=_dec_sch)
assert fd2.parallel_config.use_sequence_parallel_moe is False
g = GraphOptimizationConfig({"use_cudagraph": True})
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
_dec64 = {"splitwise_role": "decode", "max_num_seqs": 64, "max_num_batched_tokens": 4096}
fd3 = _mfd(self.mp, graph_opt_config=g, parallel=_EP, scheduler=_dec64)
assert all(s % fd3.parallel_config.tensor_parallel_size == 0 for s in g.cudagraph_capture_sizes)
g2 = GraphOptimizationConfig({"use_cudagraph": True, "cudagraph_only_prefill": True})
fd4 = _mfd(self.mp, graph_opt_config=g2, scheduler={"splitwise_role": "prefill"})
assert fd4.graph_opt_config.use_cudagraph is True
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
fd5 = _mfd(self.mp, ips="0.0.0.0", speculative_config=sp)
assert hasattr(fd5.graph_opt_config, "real_bsz_to_captured_size")
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
fd6 = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({"method": "mtp"}))
assert fd6.structured_outputs_config.guided_decoding_backend == "off"
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": -1}).cache_config.max_encoder_cache == 0
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": 10}).cache_config.max_encoder_cache == 0
def test_guided_check(self):
self._cuda()
fake_llg = types.ModuleType("llguidance")
fake_llg.torch = types.ModuleType("llguidance.torch")
self.mp.setitem(sys.modules, "llguidance", fake_llg)
self.mp.setitem(sys.modules, "llguidance.torch", fake_llg.torch)
so = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
fd = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({}))
assert fd.structured_outputs_config.guided_decoding_backend == "guidance"
with self.assertRaisesRegex(NotImplementedError, "not implemented"):
so_bad = StructuredOutputsConfig({"guided_decoding_backend": "badbackend"})
_mfd(self.mp, structured_outputs_config=so_bad, speculative_config=SpeculativeConfig({}))
self.mp.delitem(sys.modules, "llguidance", raising=False)
self.mp.delitem(sys.modules, "llguidance.torch", raising=False)
with self.assertRaisesRegex(ImportError, "llguidance"):
so_g = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
_mfd(self.mp, structured_outputs_config=so_g, speculative_config=SpeculativeConfig({}))
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
with self.assertRaises(AssertionError):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512),
cache={"enable_chunked_prefill": False}, scheduler={"max_num_batched_tokens": 256}).check()
# fmt: on
with self.assertRaisesRegex(AssertionError, "long_prefill_token_threshold"):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512), max_num_partial_prefills=2,
long_prefill_token_threshold=600, cache={"enable_chunked_prefill": True}).check()
# fmt: on
fake_xg = types.ModuleType("xgrammar")
self.mp.setitem(sys.modules, "xgrammar", fake_xg)
so2 = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
_sp = SpeculativeConfig({})
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.delitem(sys.modules, "xgrammar", raising=False)
with self.assertRaisesRegex(Exception, "XGrammar"):
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")
self.mp.setenv("FD_DISABLED_RECOVER", "1")
with self.assertRaisesRegex(AssertionError, "FD_DISABLED_RECOVER"):
_mfd(self.mp, ips="0.0.0.0").check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
self.mp.setitem(sys.modules, "cuda", None)
self.mp.setitem(sys.modules, "cuda.cuda", None)
with self.assertRaisesRegex(ImportError, "cuda-python"):
_mfd(self.mp, ips="0.0.0.0", eplb_config=EPLBConfig({"enable_eplb": True})).check()
def test_chunk_print_str(self):
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: True)
_dec = {"splitwise_role": "decode", "max_num_seqs": 20, "max_num_batched_tokens": 4096}
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 4096
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: False)
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 20
fd3 = _mfd(self.mp)
fd3.commit_config, fd3.model_config.print = CommitConfig(), lambda: None
fd3.print()
fd4 = _mfd(self.mp)
fd4.generation_config = SimpleNamespace(to_dict=lambda: {"key": "val"})
for a in ("cache_config", "model_config", "scheduler_config", "parallel_config", "commit_config"):
if (cur := getattr(fd4, a, None)) is not None and not hasattr(cur, "print"):
setattr(fd4, a, SimpleNamespace(print=lambda: None))
fd4.print()
try:
str(_mfd(self.mp))
except (TypeError, Exception):
pass
fd5 = _mfd(self.mp)
fd5.list_attr = [1, 2, 3]
fd5._str_to_list("list_attr", str)
assert fd5.list_attr == ["1", "2", "3"] and fd5._check_master() == fd5.is_master
_mfd(self.mp, ips="0.0.0.0").check()

if __name__ == "__main__":
    unittest.main()