[CI]【Hackathon 10th Spring No.33】config 单测补充 (#6730)

* [CI]【Hackathon 10th Spring No.33】config 单测补充

* fix test_commit_config: reset fields before partial-file test

* [CI]【Hackathon 10th Spring No.33】boost delta coverage for architecture helper branches

* [CI]【Hackathon 10th Spring No.33】add version attr to model config mock

* [CI]【Hackathon 10th Spring No.33】add mrope, runner validation, tail_layer coverage

* [CI]【Hackathon 10th Spring No.33】boost: cover 96 more lines (FDConfig assertions, guided decoding, env branches)

* [CI]【Hackathon 10th Spring No.33】config unit test

* [CI]【Hackathon 10th Spring No.33】cover expert parallel branch

* fix: reset commit hash before _load_from_version_file test; block cuda import via setitem(None)

* refactor: convert to unittest.TestCase style per reviewer request

---------

Co-authored-by: cloudforge1 <cloudforge1@users.noreply.github.com>
Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
Co-authored-by: Tao Luo <luotao02@baidu.com>
Authored by cloudforge1 on 2026-04-09 08:28:54 +02:00; committed by GitHub.
parent cefc724607
commit 85c6773e6c
+451 -279
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import random
import json
import sys
import tempfile
import types
import unittest
from unittest.mock import Mock
from pathlib import Path
from types import SimpleNamespace
import paddle
import pytest
import yaml
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
CommitConfig,
DeviceConfig,
EarlyStopConfig,
EPLBConfig,
ErnieArchitectures,
FDConfig,
GraphOptimizationConfig,
LoadConfig,
ModelConfig,
MoEPhase,
ParallelConfig,
RoutingReplayConfig,
SchedulerConfig,
SpeculativeConfig,
StructuredOutputsConfig,
iter_architecture_defaults,
try_match_architecture_defaults,
)
from fastdeploy.utils import get_host_ip
# fmt: off
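# Shared fixtures: _BP is a minimal Llama-style pretrained config; _EP layers expert parallelism on top of tensor parallelism.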
_BP = {"architectures": ["LlamaForCausalLM"], "hidden_size": 4096, "num_attention_heads": 32,
"num_key_value_heads": 8, "head_dim": 128, "num_hidden_layers": 32, "vocab_size": 32000,
"intermediate_size": 11008}
_EP = {"tensor_parallel_size": 4, "enable_expert_parallel": True, "data_parallel_size": 1}
def _plat(cuda=False, xpu=False, hpu=False): # noqa: E302
return SimpleNamespace(is_xpu=lambda: xpu, is_cuda=lambda: cuda, is_maca=lambda: False,
is_iluvatar=lambda: False, is_intel_hpu=lambda: hpu)
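# Fake model registry; gen/pool/mm/reason toggle the capability probes, dpt sets the default pooling type.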
def _fr(gen=True, pool=False, mm=False, reason=False, arch="LlamaForCausalLM", dpt=None): # noqa: E302
info = SimpleNamespace(default_pooling_type=dpt)
return SimpleNamespace(
is_text_generation_model=lambda a, m: gen, is_pooling_model=lambda a, m: pool,
is_multimodal_model=lambda a, m: mm, is_reasoning_model=lambda a, m: reason,
get_supported_archs=lambda: {"LlamaForCausalLM", arch}, inspect_model_cls=lambda a, m: (info, arch),
)
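# Minimal model_cfg namespace for CacheConfig (GQA layout: 8 KV heads vs 32 attention heads).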
def _mcfg(**ov): # noqa: E302
d = dict(num_key_value_heads=8, num_attention_heads=32, head_dim=128,
num_hidden_layers=24, quantization=None, quantization_config=None)
d.update(ov); return SimpleNamespace(**d) # noqa: E702
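# Lightweight ModelConfig substitute carrying only the fields FDConfig reads.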
def _fdm(**ov): # noqa: E302
d = dict(max_model_len=512, architectures=["test_model"], mm_max_tokens_per_item=None,
enable_mm=False, model_format="paddle", moe_phase=MoEPhase(),
first_k_dense_replace=0, version="init")
d.update(ov); return SimpleNamespace(**d) # noqa: E702
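# Multimodal variant of _fdm with a per-item token budget for image inputs.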
def _mm(): # noqa: E302
return _fdm(enable_mm=True, mm_max_tokens_per_item={"image": 256, "video": 0, "audio": 0})
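# Build a real ModelConfig against a temp dir: writes config.json and patches PretrainedConfig, unified-ckpt detection, pooling lookup, and the registry.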
def _mmc(mp, tp, *, pre=None, cj=None, args=None, reg=None, pc=None, arch=None): # noqa: E302
if arch and pre is None: pre = {**_BP, "architectures": [arch]} # noqa: E701
pc_ = dict(pre) if pre is not None else dict(_BP)
raw = dict(cj) if cj is not None else {**pc_, "dtype": "bfloat16"}
(tp / "config.json").write_text(json.dumps(raw))
_fpc = {"get_config_dict": staticmethod(lambda model, **kw: (dict(pc_), None)),
"from_dict": staticmethod(lambda data, **kw: SimpleNamespace(**data))}
mp.setattr("fastdeploy.config.PretrainedConfig", type("FPC", (), _fpc))
mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
mp.setattr("fastdeploy.config.get_pooling_config", lambda m, revision=None: pc)
mp.setattr(ModelConfig, "registry", property(lambda self: reg or _fr()))
a = {"model": str(tp)}
if args: a.update(args) # noqa: E701
return ModelConfig(a)
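# Assemble an FDConfig in test_mode from defaulted sub-configs; parallel/cache/scheduler/model_config overrides are forwarded as kwargs.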
def _mfd(mp, **ov): # noqa: E302
mp.setattr("fastdeploy.config.get_host_ip", lambda: "127.0.0.1")
kw = dict(parallel_config=ParallelConfig(ov.pop("parallel", {})),
graph_opt_config=GraphOptimizationConfig({}),
cache_config=CacheConfig(ov.pop("cache", {})), load_config=LoadConfig({}),
scheduler_config=SchedulerConfig(ov.pop("scheduler", {})),
model_config=ov.pop("model_config", _fdm()), test_mode=True)
kw.update(ov); return FDConfig(**kw) # noqa: E702
# fmt: on
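# TestConfig exercises the architecture helpers, ModelConfig parsing, and the smaller sub-configs; TestFDConfig below covers full FDConfig assembly.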
class TestConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def test_fdconfig_nnode(self):
parallel_config = ParallelConfig({"tensor_parallel_size": 16, "expert_parallel_size": 1})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
load_config=load_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips=[get_host_ip(), "0.0.0.0"],
test_mode=True,
)
assert fd_config.nnode == 2
assert fd_config.is_master is True
def test_fdconfig_ips(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
load_config=load_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
assert fd_config.master_ip == "0.0.0.0"
def test_fdconfig_max_num_tokens(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
cache_config.enable_chunked_prefill = True
scheduler_config = SchedulerConfig({})
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert fd_config.scheduler_config.max_num_batched_tokens == 2048
cache_config.enable_chunked_prefill = False
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert fd_config.scheduler_config.max_num_batched_tokens == 8192
def test_architecture_ernie(self):
assert len(list(iter_architecture_defaults())) > 5
assert try_match_architecture_defaults("LlamaForCausalLM") == ("ForCausalLM", ("generate", "none"))
assert ErnieArchitectures.contains_ernie_arch(["Ernie4_5ForCausalLM"])
assert ErnieArchitectures.is_ernie_arch("Ernie4_5_MoeForCausalLM")
assert ErnieArchitectures.is_ernie5_arch(["Ernie5ForCausalLM"])
fake = type("_E", (), {"name": staticmethod(lambda: "ErnieTestForCausalLM")})
ErnieArchitectures.register_ernie_model_arch(fake)
try:
assert ErnieArchitectures.is_ernie_arch("ErnieTestForCausalLM")
finally:
ErnieArchitectures.ARCHITECTURES.discard("ErnieTestForCausalLM")
assert not ErnieArchitectures.contains_ernie_arch(["LlamaForCausalLM"])
assert not ErnieArchitectures.is_ernie_arch("ErnieUnknownForCausalLM")
assert not ErnieArchitectures.is_ernie5_arch(["LlamaForCausalLM"])
phase = MoEPhase()
phase.phase = "decode"
with self.assertRaises(ValueError):
phase.phase = "invalid"
assert DeviceConfig({"device_type": "xpu"}).device_type == "xpu"
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="generate") is not None
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="pooling") is None
assert try_match_architecture_defaults("ToyRewardModel", convert_type="reward") is not None
assert try_match_architecture_defaults("ToyForImageClassification", convert_type="reward") is None
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar", "reasoning_parser": "test"})
assert so.guided_decoding_backend == "xgrammar" and "xgrammar" in str(so)
rr = RoutingReplayConfig({"enable_routing_replay": True, "routing_store_type": "rdma"})
assert rr.enable_routing_replay is True and "rdma" in rr.to_json_string()
assert RoutingReplayConfig(None).enable_routing_replay is False
def test_graph_cache_spec_parallel(self):
g = GraphOptimizationConfig({})
assert isinstance(g.use_cudagraph, bool)
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
g.cudagraph_capture_sizes_prefill = [8, 4, 2, 1]
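# (sic) "init_with_cudagrpah_size" is the upstream GraphOptimizationConfig method's spelling; kept as-is so the call resolves.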
g.init_with_cudagrpah_size(max_capture_size=128, max_capture_shape_prefill=8)
g.filter_capture_size(tp_size=2)
assert all(s % 2 == 0 for s in g.cudagraph_capture_sizes)
assert CacheConfig.get_cache_bytes("bf16") == 2
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16", "num_gpu_blocks_override": 100})
c.max_block_num_per_seq = 8
c.postprocess(num_total_tokens=1024, number_of_tasks=2)
assert c.total_block_num == 100
r = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
r.max_block_num_per_seq, r.enc_dec_block_num = 4, 0
r.reset(num_gpu_blocks=200)
assert r.total_block_num == 200
es = EarlyStopConfig({"enable_early_stop": True, "threshold": 0.5})
es.enable_early_stop = None
es.update_enable_early_stop(True)
assert es.enable_early_stop is True
sp = SpeculativeConfig({"method": "mtp"})
sp.num_model_steps, sp.num_speculative_tokens = 3, 1
sp.check_legality_parameters()
assert sp.num_speculative_tokens == 3
self.mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
(self.tp / "config.json").write_text(json.dumps({"num_hidden_layers": 32}))
fsp = SpeculativeConfig({"method": "mtp", "model": str(self.tp)})
assert fsp.model_config == {"num_hidden_layers": 32}
self.mp.setenv("FLAGS_use_pd_disaggregation", "1")
assert ParallelConfig({}).pd_disaggregation_mode == "per_query"
gid, grp = [], []
self.mp.setattr("fastdeploy.config.dist.collective._set_custom_gid", gid.append)
self.mp.setattr("fastdeploy.config.dist.new_group", lambda r: (grp.append(list(r)), tuple(r))[1])
# fmt: off
p = ParallelConfig({"data_parallel_rank": 1, "data_parallel_size": 2,
"tensor_parallel_size": 4, "enable_expert_parallel": True}) # noqa: E127
# fmt: on
p.set_communicate_group()
assert gid == [1 + envs.FD_TP_GROUP_GID_OFFSET, None, 2 + envs.FD_TP_GROUP_GID_OFFSET, None]
assert grp == [[4, 5, 6, 7], list(range(8))]
assert p.tp_group == (4, 5, 6, 7) and p.ep_group == tuple(range(8))
def test_modelconfig_defaults_validation(self):
self.mp.setenv("COMPRESSION_RATIO", "1.25")
pre = {**_BP, "infer_model_mp_num": 2, "remove_tail_layer": 3, "n_routed_experts": 16}
cfg = _mmc(self.mp, self.tp, pre=pre)
assert cfg.runner_type == "generate" and cfg.num_hidden_layers == 29
assert cfg.tensor_parallel_size == 2 and cfg.moe_num_experts == 16
assert cfg.compression_ratio == 1.25
# fmt: off
pool_pre = {**_BP, "text_config": {"custom_text_attr": 99},
"vision_config": {"image_size": 224, "patch_size": 14}}
pcfg = _mmc(self.mp, self.tp, pre=pool_pre, args={"runner": "pooling", "convert": "auto"},
reg=_fr(gen=False, pool=True), pc={"normalize": True})
# fmt: on
assert pcfg.runner_type == "pooling" and pcfg.custom_text_attr == 99
assert pcfg.vision_config.image_size == 224 and "encode" in pcfg.supported_tasks
with self.assertRaisesRegex(ValueError, "less than -1"):
_mmc(self.mp, self.tp, args={"max_logprobs": -2})
with self.assertRaisesRegex(ValueError, "greater than the vocabulary"):
_mmc(self.mp, self.tp, args={"max_logprobs": 99999})
with self.assertRaisesRegex(ValueError, "does not support.*generate"):
_mmc(self.mp, self.tp, args={"runner": "generate", "model_impl": "fastdeploy"}, reg=_fr(gen=False))
with self.assertRaisesRegex(ValueError, "does not support.*pooling"):
_mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "none"}, reg=_fr(gen=False))
def test_fdconfig_init_cache(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
cache_config.cache_transfer_protocol = "rdma,ipc"
cache_config.pd_comm_port = "2334"
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
scheduler_config.splitwise_role = "prefill"
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
test_mode=True,
)
fd_config.init_cache_info()
assert fd_config.register_info is not None
def test_modelconfig_mrope_format(self):
mrp = {**_BP, "mrope_section": [16, 24, 24], "rope_scaling": {"type": "mrope", "factor": 1.0}}
cfg = _mmc(self.mp, self.tp, pre=mrp)
assert cfg.rope_3d and cfg.rope_scaling["mrope_section"] == [16, 24, 24] and cfg.freq_allocation == 16
cfg2 = _mmc(self.mp, self.tp, pre={**_BP, "mrope_section": [8, 12, 12]})
assert cfg2.rope_3d and cfg2.rope_scaling == {"mrope_section": [8, 12, 12]}
assert _mmc(self.mp, self.tp, pre={**_BP, "remove_tail_layer": True}).num_hidden_layers == 31
for cj, exp in [
({**_BP, "torch_dtype": "bfloat16"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.57.0"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.55.0"}, "paddle"),
]:
assert _mmc(self.mp, self.tp, cj=cj).model_format == exp
with self.assertRaisesRegex(ValueError, "Only one of"):
_mmc(self.mp, self.tp, cj={**_BP, "torch_dtype": "bf16", "dtype": "bf16"})
mxfp4 = {**_BP, "quantization_config": {"quant_method": "mxfp4"}}
assert _mmc(self.mp, self.tp, cj=mxfp4).model_format == "torch"
with self.assertRaisesRegex(ValueError, "Unknown model format"):
_mmc(self.mp, self.tp, cj={**_BP})
ecfg = _mmc(self.mp, self.tp, pre={**_BP, "n_shared_experts": 4, "moe_num_shared_experts": None})
assert ecfg.moe_num_shared_experts == 4
(self.tp / "version.yaml").write_text(yaml.dump({"version": "2.0"}))
ecfg.read_model_version()
assert ecfg.version == "2.0"
def test_modelconfig_pooling_tasks(self):
cfg = _mmc(self.mp, self.tp, arch="MysteryArch", reg=_fr(gen=False, arch="OtherArch"))
assert cfg._get_default_runner_type(["MysteryArch"]) == "generate"
assert cfg._get_default_convert_type(["MysteryArch"], "generate") == "none"
_te_reg = _fr(gen=False, pool=True, arch="OtherArch", dpt="CLS")
# fmt: off
pcfg = _mmc(self.mp, self.tp, arch="ToyEmbeddingModel",
args={"runner": "pooling", "convert": "auto"}, reg=_te_reg)
# fmt: on
assert pcfg._get_default_pooling_task(["ToyEmbeddingModel"]) == "embed"
assert pcfg.supported_tasks == ["encode", "embed"]
with self.assertRaisesRegex(TypeError, "PoolerConfig"):
_pa = {"runner": "pooling", "convert": "auto", "override_pooler_config": {"normalize": True}}
_mmc(self.mp, self.tp, arch="ToyEmbeddingModel", args=_pa, reg=_te_reg)
cfg2 = _mmc(self.mp, self.tp)
with self.assertRaises(AssertionError):
cfg2._get_supported_tasks(["LlamaForCausalLM"], "invalid", "none")
assert cfg2._get_download_model("demo") is None
# fmt: off
acfg = _mmc(self.mp, self.tp, args={"runner": "auto", "convert": "auto"},
reg=_fr(gen=False, pool=True, dpt="CLS"))
# fmt: on
assert acfg.runner_type == "pooling" and acfg.convert_type == "none"
assert acfg.pooler_config is not None and acfg.pooler_config.pooling_type == "CLS"
assert "encode" in acfg.supported_tasks
ecfg = _mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "auto"}, reg=_fr(gen=False))
assert ecfg.convert_type == "embed"
def test_fdconfig_postprocess_ports(self):
data_parallel_size = 4
tensor_parallel_size = 2
local_data_parallel_id = random.randint(0, data_parallel_size - 1)
engine_worker_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
cache_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
pd_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
rdma_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size * tensor_parallel_size)]
parallel_config = ParallelConfig(
{
"engine_worker_queue_port": ",".join(map(str, engine_worker_queue_ports)),
"data_parallel_size": data_parallel_size,
"tensor_parallel_size": tensor_parallel_size,
"local_data_parallel_id": local_data_parallel_id,
}
)
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig(
{
"cache_queue_port": ",".join(map(str, cache_queue_ports)),
"pd_comm_port": ",".join(map(str, pd_comm_ports)),
"rdma_comm_ports": ",".join(map(str, rdma_comm_ports)),
}
)
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
assert (
fd_config.parallel_config.local_engine_worker_queue_port
== engine_worker_queue_ports[local_data_parallel_id]
)
assert fd_config.cache_config.local_cache_queue_port == cache_queue_ports[local_data_parallel_id]
assert fd_config.cache_config.local_pd_comm_port == pd_comm_ports[local_data_parallel_id]
assert (
fd_config.cache_config.local_rdma_comm_ports
== rdma_comm_ports[
local_data_parallel_id * tensor_parallel_size : (local_data_parallel_id + 1) * tensor_parallel_size
]
)
class TestFDConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def _cuda(self):
self.mp.setattr("fastdeploy.config.current_platform", _plat(cuda=True))
def test_topology_env(self):
# fmt: off
multi = _mfd(self.mp, ips=["127.0.0.1", "0.0.0.0"],
parallel={"tensor_parallel_size": 16, "expert_parallel_size": 1})
# fmt: on
assert multi.nnode == 2 and multi.is_master is True
# fmt: off
_par = {"engine_worker_queue_port": "8010,8011,8012,8013", "data_parallel_size": 4,
"tensor_parallel_size": 2, "local_data_parallel_id": 2}
_cch = {"cache_queue_port": "8110,8111,8112,8113", "pd_comm_port": "8210,8211,8212,8213",
"rdma_comm_ports": "8310,8311,8320,8321,8330,8331,8340,8341"}
# fmt: on
ported = _mfd(self.mp, ips="0.0.0.0", parallel=_par, cache=_cch)
cc = ported.cache_config
assert ported.parallel_config.local_engine_worker_queue_port == 8012
assert cc.local_cache_queue_port == 8112 and cc.local_pd_comm_port == 8212
assert cc.local_rdma_comm_ports == [8330, 8331]
glm = _mfd(self.mp, model_config=_fdm(architectures=["Glm4MoeForCausalLM"], first_k_dense_replace=2))
assert glm.model_config.moe_layer_start_index == 2
dec = _mfd(self.mp, scheduler={"splitwise_role": "decode", "max_num_seqs": 34, "max_num_batched_tokens": 2048})
assert dec.get_max_chunk_tokens() == 34
dec.test_attr = "1,2,3"
dec._str_to_list("test_attr", int)
assert dec.test_attr == [1, 2, 3]
dec.test_attr2 = None
dec._str_to_list("test_attr2", int)
assert dec.test_attr2 is None
fd = _mfd(self.mp, ips=["10.0.0.1", "127.0.0.1"], parallel={"tensor_parallel_size": 16})
assert fd.is_master is False and fd.master_ip == "10.0.0.1"
# fmt: off
fd_v1 = _mfd(self.mp, scheduler={"name": "local", "splitwise_role": "prefill"},
router_config=SimpleNamespace(router="http://r", api_server_port=8080, metrics_port=9090))
# fmt: on
assert fd_v1.splitwise_version == "v1"
# fmt: off
reg = _mfd(self.mp, cache={"cache_transfer_protocol": "rdma,ipc", "pd_comm_port": "2334"},
scheduler={"splitwise_role": "prefill"})
# fmt: on
assert reg.register_info is not None
pf = _mfd(self.mp, ips="0.0.0.0", scheduler={"splitwise_role": "prefill"})
assert pf.model_config.moe_phase.phase == "prefill"
self.mp.setenv("FD_FOR_TORCH_MODEL_FORMAT", "1")
assert _mfd(self.mp).model_config.model_format == "torch"
self.mp.delenv("FD_FOR_TORCH_MODEL_FORMAT", raising=False)
self.mp.setenv("FD_ENABLE_MAX_PREFILL", "1")
assert _mfd(self.mp, scheduler={"max_num_seqs": 42}).max_prefill_batch == 42
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), cache={"enable_chunked_prefill": True})
assert fd2.scheduler_config.max_num_batched_tokens == 2048
def test_mm_dynload_subconfig(self):
assert _mfd(self.mp, model_config=_mm()).cache_config.max_encoder_cache == 0
e5 = _mfd(self.mp, model_config=_fdm(architectures=["Ernie5ForCausalLM"]))
assert getattr(e5.cache_config, "disable_chunked_mm_input", False) is True
dyn = _mfd(self.mp, load_config=LoadConfig({"dynamic_load_weight": True}))
assert dyn.graph_opt_config.graph_opt_level == 0
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
spf = _mfd(self.mp, speculative_config=sp, scheduler={"splitwise_role": "prefill"})
assert spf.speculative_config.num_speculative_tokens == 1 and spf.speculative_config.num_model_steps == 1
model = _fdm()
model.read_model_version = lambda: setattr(model, "version", "tv")
_rc = SimpleNamespace(router="http://127.0.0.1:8000", api_server_port=8000, metrics_port=8000)
# fmt: off
fd = _mfd(self.mp, model_config=model,
load_config=LoadConfig({"dynamic_load_weight": True}), router_config=_rc)
# fmt: on
assert fd.model_config.version == "tv"
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"gpu_memory_utilization": 1.5, "model_cfg": _mcfg()})
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"kv_cache_ratio": 1.5, "model_cfg": _mcfg()})
sp2 = SpeculativeConfig({"method": "mtp"})
sp2.print()
with self.assertRaisesRegex(ValueError, "max_ngram_size >= min_ngram_size"):
SpeculativeConfig({"method": "ngram", "max_ngram_size": 1, "min_ngram_size": 5})
sp2._apply_user_args(None)
self.mp.setenv("SPECULATE_VERIFY_USE_TOPK", "1")
assert SpeculativeConfig({"method": "mtp"}).verify_strategy.value == 1
assert SpeculativeConfig({"method": "naive", "num_speculative_tokens": 5}).num_speculative_tokens == 0
ep = EPLBConfig(None)
assert ep.enable_eplb is False
ep.print()
es = EarlyStopConfig({"enable_early_stop": False})
with self.assertRaisesRegex(ValueError, "Cannot set"):
es.update_enable_early_stop(True)
cc = CommitConfig()
cc.fastdeploy_commit = ""
cc._load_from_version_file(str(self.tp / "nonexistent.txt"))
assert cc.fastdeploy_commit == ""
bad = self.tp / "bad_version.txt"
bad.write_bytes(b"\xff\xfe" + bytes(range(128, 256)))
cc._load_from_version_file(str(bad))
cc.print()
def test_v0_platforms(self):
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
c.max_block_num_per_seq, c.enc_dec_block_num = 4, 0
c.reset(num_gpu_blocks=200)
assert c.total_block_num == 200 and c.prefill_kvcache_block_num == int(200 * c.kv_cache_ratio)
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
self._cuda()
assert _mfd(self.mp, model_config=_mm()).max_prefill_batch == 1
# fmt: off
fd = _mfd(self.mp, model_config=_fdm(max_model_len=4096),
scheduler={"max_num_batched_tokens": None, "enable_chunked_prefill": True},
cache={"enable_chunked_prefill": True})
# fmt: on
assert fd.scheduler_config.max_num_batched_tokens == 2048
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), scheduler={"max_num_batched_tokens": None})
assert fd2.scheduler_config.max_num_batched_tokens == 4096
fd3 = _mfd(self.mp, model_config=_mm(), cache={"enable_prefix_caching": True})
assert fd3.cache_config.enable_prefix_caching is False
self.mp.setattr("fastdeploy.config.current_platform", _plat(xpu=True))
self.mp.setenv("XPU_VISIBLE_DEVICES", "0,1")
assert _mfd(self.mp).parallel_config.device_ids == "0,1"
self.mp.setattr("fastdeploy.config.current_platform", _plat(hpu=True))
self.mp.setenv("HPU_VISIBLE_DEVICES", "2,3")
assert _mfd(self.mp).parallel_config.device_ids == "2,3"
def test_cudagraph_mm_seq(self):
self._cuda()
fd1 = _mfd(self.mp, parallel=_EP, scheduler={"max_num_seqs": 2})
assert fd1.parallel_config.use_sequence_parallel_moe is False
_dec_sch = {"splitwise_role": "decode", "max_num_seqs": 2, "max_num_batched_tokens": 4096}
fd2 = _mfd(self.mp, parallel=_EP, scheduler=_dec_sch)
assert fd2.parallel_config.use_sequence_parallel_moe is False
g = GraphOptimizationConfig({"use_cudagraph": True})
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
_dec64 = {"splitwise_role": "decode", "max_num_seqs": 64, "max_num_batched_tokens": 4096}
fd3 = _mfd(self.mp, graph_opt_config=g, parallel=_EP, scheduler=_dec64)
assert all(s % fd3.parallel_config.tensor_parallel_size == 0 for s in g.cudagraph_capture_sizes)
g2 = GraphOptimizationConfig({"use_cudagraph": True, "cudagraph_only_prefill": True})
fd4 = _mfd(self.mp, graph_opt_config=g2, scheduler={"splitwise_role": "prefill"})
assert fd4.graph_opt_config.use_cudagraph is True
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
fd5 = _mfd(self.mp, ips="0.0.0.0", speculative_config=sp)
assert hasattr(fd5.graph_opt_config, "real_bsz_to_captured_size")
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
fd6 = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({"method": "mtp"}))
assert fd6.structured_outputs_config.guided_decoding_backend == "off"
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": -1}).cache_config.max_encoder_cache == 0
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": 10}).cache_config.max_encoder_cache == 0
def test_guided_check(self):
self._cuda()
fake_llg = types.ModuleType("llguidance")
fake_llg.torch = types.ModuleType("llguidance.torch")
self.mp.setitem(sys.modules, "llguidance", fake_llg)
self.mp.setitem(sys.modules, "llguidance.torch", fake_llg.torch)
so = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
fd = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({}))
assert fd.structured_outputs_config.guided_decoding_backend == "guidance"
with self.assertRaisesRegex(NotImplementedError, "not implemented"):
so_bad = StructuredOutputsConfig({"guided_decoding_backend": "badbackend"})
_mfd(self.mp, structured_outputs_config=so_bad, speculative_config=SpeculativeConfig({}))
self.mp.delitem(sys.modules, "llguidance", raising=False)
self.mp.delitem(sys.modules, "llguidance.torch", raising=False)
with self.assertRaisesRegex(ImportError, "llguidance"):
so_g = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
_mfd(self.mp, structured_outputs_config=so_g, speculative_config=SpeculativeConfig({}))
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
with self.assertRaises(AssertionError):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512),
cache={"enable_chunked_prefill": False}, scheduler={"max_num_batched_tokens": 256}).check()
# fmt: on
with self.assertRaisesRegex(AssertionError, "long_prefill_token_threshold"):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512), max_num_partial_prefills=2,
long_prefill_token_threshold=600, cache={"enable_chunked_prefill": True}).check()
# fmt: on
fake_xg = types.ModuleType("xgrammar")
self.mp.setitem(sys.modules, "xgrammar", fake_xg)
so2 = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
_sp = SpeculativeConfig({})
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.delitem(sys.modules, "xgrammar", raising=False)
with self.assertRaisesRegex(Exception, "XGrammar"):
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")
self.mp.setenv("FD_DISABLED_RECOVER", "1")
with self.assertRaisesRegex(AssertionError, "FD_DISABLED_RECOVER"):
_mfd(self.mp, ips="0.0.0.0").check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
self.mp.setitem(sys.modules, "cuda", None)
self.mp.setitem(sys.modules, "cuda.cuda", None)
with self.assertRaisesRegex(ImportError, "cuda-python"):
_mfd(self.mp, ips="0.0.0.0", eplb_config=EPLBConfig({"enable_eplb": True})).check()
# Test case 1: swap_space is None -> num_cpu_blocks = 0
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": None,
}
)
assert cache_config.num_cpu_blocks == 0
# Test case 2: swap_space = 1GB
# bytes_per_block = head_num * head_dim * byte_size * kv_factor * block_size * num_hidden_layers
# = 32 * 128 * 2 * 2 * 64 * 24 = 25165824 bytes
# num_cpu_blocks = 1 * 1024^3 / 25165824 = 42
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 2 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 42
# Test case 3: swap_space = 2GB
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 2,
}
)
assert cache_config.num_cpu_blocks == 85
# Test case 4: with fp32 dtype (4 bytes)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "float32",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 4 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 21
# Test case 5: with int8 dtype (1 byte)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "int8",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 1 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 85
# Test case 6: num_cpu_blocks is explicitly set (not affected by swap_space)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 10,
"num_cpu_blocks": 100,
}
)
assert cache_config.num_cpu_blocks == 100
# Test case 7: with num_key_value_heads (GQA)
model_config_with_gqa = Mock()
model_config_with_gqa.num_key_value_heads = 8 # GQA
model_config_with_gqa.num_attention_heads = 32
model_config_with_gqa.head_dim = 128
model_config_with_gqa.num_hidden_layers = 24
model_config_with_gqa.quantization = None
model_config_with_gqa.quantization_config = None
cache_config = CacheConfig(
{
"model_cfg": model_config_with_gqa,
"cache_dtype": "bfloat16",
"swap_space": 1,
}
)
# bytes_per_block = 8 * 128 * 2 * 2 * 64 * 24 = 6291456 bytes
# num_cpu_blocks = 1 * 1024^3 / 6291456 = 170
expected_blocks = int(1 * 1024**3 / (8 * 128 * 2 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 170
def test_chunk_print_str(self):
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: True)
_dec = {"splitwise_role": "decode", "max_num_seqs": 20, "max_num_batched_tokens": 4096}
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 4096
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: False)
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 20
fd3 = _mfd(self.mp)
fd3.commit_config, fd3.model_config.print = CommitConfig(), lambda: None
fd3.print()
fd4 = _mfd(self.mp)
fd4.generation_config = SimpleNamespace(to_dict=lambda: {"key": "val"})
for a in ("cache_config", "model_config", "scheduler_config", "parallel_config", "commit_config"):
if (cur := getattr(fd4, a, None)) is not None and not hasattr(cur, "print"):
setattr(fd4, a, SimpleNamespace(print=lambda: None))
fd4.print()
try:
str(_mfd(self.mp))
except Exception:  # str() on a partially mocked config may raise; any failure is tolerated here
pass
fd5 = _mfd(self.mp)
fd5.list_attr = [1, 2, 3]
fd5._str_to_list("list_attr", str)
assert fd5.list_attr == ["1", "2", "3"] and fd5._check_master() == fd5.is_master
_mfd(self.mp, ips="0.0.0.0").check()
if __name__ == "__main__":