[CI]【Hackathon 10th Spring No.33】config 单测补充 (#6730)

* [CI]【Hackathon 10th Spring No.33】config 单测补充

* fix test_commit_config: reset fields before partial-file test

* [CI]【Hackathon 10th Spring No.33】boost delta coverage for architecture helper branches

* [CI]【Hackathon 10th Spring No.33】add version attr to model config mock

* [CI]【Hackathon 10th Spring No.33】add mrope, runner validation, tail_layer coverage

* [CI]【Hackathon 10th Spring No.33】boost: cover 96 more lines (FDConfig assertions, guided decoding, env branches)

* [CI]【Hackathon 10th Spring No.33】config unit test

* [CI]【Hackathon 10th Spring No.33】cover expert parallel branch

* fix: reset commit hash before _load_from_version_file test; block cuda import via setitem(None)

* refactor: convert to unittest.TestCase style per reviewer request

---------

Co-authored-by: cloudforge1 <cloudforge1@users.noreply.github.com>
Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
Co-authored-by: Tao Luo <luotao02@baidu.com>
Authored by cloudforge1 on 2026-04-09 08:28:54 +02:00; committed by GitHub.
parent cefc724607
commit 85c6773e6c
+451 -279
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import random
import json
import sys
import tempfile
import types
import unittest
from unittest.mock import Mock
from pathlib import Path
from types import SimpleNamespace
import paddle
import pytest
import yaml
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
CommitConfig,
DeviceConfig,
EarlyStopConfig,
EPLBConfig,
ErnieArchitectures,
FDConfig,
GraphOptimizationConfig,
LoadConfig,
ModelConfig,
MoEPhase,
ParallelConfig,
RoutingReplayConfig,
SchedulerConfig,
SpeculativeConfig,
StructuredOutputsConfig,
iter_architecture_defaults,
try_match_architecture_defaults,
)
from fastdeploy.utils import get_host_ip
# fmt: off
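# Shared fixtures: _BP is a minimal Llama-style pretrained config; _EP layers expert parallelism on top of tensor parallelism.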
_BP = {"architectures": ["LlamaForCausalLM"], "hidden_size": 4096, "num_attention_heads": 32,
"num_key_value_heads": 8, "head_dim": 128, "num_hidden_layers": 32, "vocab_size": 32000,
"intermediate_size": 11008}
_EP = {"tensor_parallel_size": 4, "enable_expert_parallel": True, "data_parallel_size": 1}
def _plat(cuda=False, xpu=False, hpu=False): # noqa: E302
return SimpleNamespace(is_xpu=lambda: xpu, is_cuda=lambda: cuda, is_maca=lambda: False,
is_iluvatar=lambda: False, is_intel_hpu=lambda: hpu)
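# Fake model registry; gen/pool/mm/reason toggle the capability probes, dpt sets the default pooling type.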
def _fr(gen=True, pool=False, mm=False, reason=False, arch="LlamaForCausalLM", dpt=None): # noqa: E302
info = SimpleNamespace(default_pooling_type=dpt)
return SimpleNamespace(
is_text_generation_model=lambda a, m: gen, is_pooling_model=lambda a, m: pool,
is_multimodal_model=lambda a, m: mm, is_reasoning_model=lambda a, m: reason,
get_supported_archs=lambda: {"LlamaForCausalLM", arch}, inspect_model_cls=lambda a, m: (info, arch),
)
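# Minimal model_cfg namespace for CacheConfig (GQA layout: 8 KV heads vs 32 attention heads).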
def _mcfg(**ov): # noqa: E302
d = dict(num_key_value_heads=8, num_attention_heads=32, head_dim=128,
num_hidden_layers=24, quantization=None, quantization_config=None)
d.update(ov); return SimpleNamespace(**d) # noqa: E702
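# Lightweight ModelConfig substitute carrying only the fields FDConfig reads.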
def _fdm(**ov): # noqa: E302
d = dict(max_model_len=512, architectures=["test_model"], mm_max_tokens_per_item=None,
enable_mm=False, model_format="paddle", moe_phase=MoEPhase(),
first_k_dense_replace=0, version="init")
d.update(ov); return SimpleNamespace(**d) # noqa: E702
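# Multimodal variant of _fdm with a per-item token budget for image inputs.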
def _mm(): # noqa: E302
return _fdm(enable_mm=True, mm_max_tokens_per_item={"image": 256, "video": 0, "audio": 0})
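# Build a real ModelConfig against a temp dir: writes config.json and patches PretrainedConfig, unified-ckpt detection, pooling lookup, and the registry.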
def _mmc(mp, tp, *, pre=None, cj=None, args=None, reg=None, pc=None, arch=None): # noqa: E302
if arch and pre is None: pre = {**_BP, "architectures": [arch]} # noqa: E701
pc_ = dict(pre) if pre is not None else dict(_BP)
raw = dict(cj) if cj is not None else {**pc_, "dtype": "bfloat16"}
(tp / "config.json").write_text(json.dumps(raw))
_fpc = {"get_config_dict": staticmethod(lambda model, **kw: (dict(pc_), None)),
"from_dict": staticmethod(lambda data, **kw: SimpleNamespace(**data))}
mp.setattr("fastdeploy.config.PretrainedConfig", type("FPC", (), _fpc))
mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
mp.setattr("fastdeploy.config.get_pooling_config", lambda m, revision=None: pc)
mp.setattr(ModelConfig, "registry", property(lambda self: reg or _fr()))
a = {"model": str(tp)}
if args: a.update(args) # noqa: E701
return ModelConfig(a)
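# Assemble an FDConfig in test_mode from defaulted sub-configs; parallel/cache/scheduler/model_config overrides are forwarded as kwargs.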
def _mfd(mp, **ov): # noqa: E302
mp.setattr("fastdeploy.config.get_host_ip", lambda: "127.0.0.1")
kw = dict(parallel_config=ParallelConfig(ov.pop("parallel", {})),
graph_opt_config=GraphOptimizationConfig({}),
cache_config=CacheConfig(ov.pop("cache", {})), load_config=LoadConfig({}),
scheduler_config=SchedulerConfig(ov.pop("scheduler", {})),
model_config=ov.pop("model_config", _fdm()), test_mode=True)
kw.update(ov); return FDConfig(**kw) # noqa: E702
# fmt: on
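# TestConfig exercises the architecture helpers, ModelConfig parsing, and the smaller sub-configs; TestFDConfig below covers full FDConfig assembly.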
class TestConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def test_fdconfig_nnode(self):
parallel_config = ParallelConfig({"tensor_parallel_size": 16, "expert_parallel_size": 1})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
load_config=load_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips=[get_host_ip(), "0.0.0.0"],
test_mode=True,
)
assert fd_config.nnode == 2
assert fd_config.is_master is True
def test_fdconfig_ips(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
load_config=load_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
assert fd_config.master_ip == "0.0.0.0"
def test_fdconfig_max_num_tokens(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
load_config = LoadConfig({})
cache_config.enable_chunked_prefill = True
scheduler_config = SchedulerConfig({})
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert fd_config.scheduler_config.max_num_batched_tokens == 2048
cache_config.enable_chunked_prefill = False
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
assert fd_config.scheduler_config.max_num_batched_tokens == 8192
def test_architecture_ernie(self):
assert len(list(iter_architecture_defaults())) > 5
assert try_match_architecture_defaults("LlamaForCausalLM") == ("ForCausalLM", ("generate", "none"))
assert ErnieArchitectures.contains_ernie_arch(["Ernie4_5ForCausalLM"])
assert ErnieArchitectures.is_ernie_arch("Ernie4_5_MoeForCausalLM")
assert ErnieArchitectures.is_ernie5_arch(["Ernie5ForCausalLM"])
fake = type("_E", (), {"name": staticmethod(lambda: "ErnieTestForCausalLM")})
ErnieArchitectures.register_ernie_model_arch(fake)
try:
assert ErnieArchitectures.is_ernie_arch("ErnieTestForCausalLM")
finally:
ErnieArchitectures.ARCHITECTURES.discard("ErnieTestForCausalLM")
assert not ErnieArchitectures.contains_ernie_arch(["LlamaForCausalLM"])
assert not ErnieArchitectures.is_ernie_arch("ErnieUnknownForCausalLM")
assert not ErnieArchitectures.is_ernie5_arch(["LlamaForCausalLM"])
phase = MoEPhase()
phase.phase = "decode"
with self.assertRaises(ValueError):
phase.phase = "invalid"
assert DeviceConfig({"device_type": "xpu"}).device_type == "xpu"
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="generate") is not None
assert try_match_architecture_defaults("ToyForCausalLM", runner_type="pooling") is None
assert try_match_architecture_defaults("ToyRewardModel", convert_type="reward") is not None
assert try_match_architecture_defaults("ToyForImageClassification", convert_type="reward") is None
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar", "reasoning_parser": "test"})
assert so.guided_decoding_backend == "xgrammar" and "xgrammar" in str(so)
rr = RoutingReplayConfig({"enable_routing_replay": True, "routing_store_type": "rdma"})
assert rr.enable_routing_replay is True and "rdma" in rr.to_json_string()
assert RoutingReplayConfig(None).enable_routing_replay is False
def test_graph_cache_spec_parallel(self):
g = GraphOptimizationConfig({})
assert isinstance(g.use_cudagraph, bool)
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
g.cudagraph_capture_sizes_prefill = [8, 4, 2, 1]
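# (sic) "init_with_cudagrpah_size" is the upstream GraphOptimizationConfig method's spelling; kept as-is so the call resolves.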
g.init_with_cudagrpah_size(max_capture_size=128, max_capture_shape_prefill=8)
g.filter_capture_size(tp_size=2)
assert all(s % 2 == 0 for s in g.cudagraph_capture_sizes)
assert CacheConfig.get_cache_bytes("bf16") == 2
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16", "num_gpu_blocks_override": 100})
c.max_block_num_per_seq = 8
c.postprocess(num_total_tokens=1024, number_of_tasks=2)
assert c.total_block_num == 100
r = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
r.max_block_num_per_seq, r.enc_dec_block_num = 4, 0
r.reset(num_gpu_blocks=200)
assert r.total_block_num == 200
es = EarlyStopConfig({"enable_early_stop": True, "threshold": 0.5})
es.enable_early_stop = None
es.update_enable_early_stop(True)
assert es.enable_early_stop is True
sp = SpeculativeConfig({"method": "mtp"})
sp.num_model_steps, sp.num_speculative_tokens = 3, 1
sp.check_legality_parameters()
assert sp.num_speculative_tokens == 3
self.mp.setattr("fastdeploy.config.check_unified_ckpt", lambda m: False)
(self.tp / "config.json").write_text(json.dumps({"num_hidden_layers": 32}))
fsp = SpeculativeConfig({"method": "mtp", "model": str(self.tp)})
assert fsp.model_config == {"num_hidden_layers": 32}
self.mp.setenv("FLAGS_use_pd_disaggregation", "1")
assert ParallelConfig({}).pd_disaggregation_mode == "per_query"
gid, grp = [], []
self.mp.setattr("fastdeploy.config.dist.collective._set_custom_gid", gid.append)
self.mp.setattr("fastdeploy.config.dist.new_group", lambda r: (grp.append(list(r)), tuple(r))[1])
# fmt: off
p = ParallelConfig({"data_parallel_rank": 1, "data_parallel_size": 2,
"tensor_parallel_size": 4, "enable_expert_parallel": True}) # noqa: E127
# fmt: on
p.set_communicate_group()
assert gid == [1 + envs.FD_TP_GROUP_GID_OFFSET, None, 2 + envs.FD_TP_GROUP_GID_OFFSET, None]
assert grp == [[4, 5, 6, 7], list(range(8))]
assert p.tp_group == (4, 5, 6, 7) and p.ep_group == tuple(range(8))
def test_modelconfig_defaults_validation(self):
self.mp.setenv("COMPRESSION_RATIO", "1.25")
pre = {**_BP, "infer_model_mp_num": 2, "remove_tail_layer": 3, "n_routed_experts": 16}
cfg = _mmc(self.mp, self.tp, pre=pre)
assert cfg.runner_type == "generate" and cfg.num_hidden_layers == 29
assert cfg.tensor_parallel_size == 2 and cfg.moe_num_experts == 16
assert cfg.compression_ratio == 1.25
# fmt: off
pool_pre = {**_BP, "text_config": {"custom_text_attr": 99},
"vision_config": {"image_size": 224, "patch_size": 14}}
pcfg = _mmc(self.mp, self.tp, pre=pool_pre, args={"runner": "pooling", "convert": "auto"},
reg=_fr(gen=False, pool=True), pc={"normalize": True})
# fmt: on
assert pcfg.runner_type == "pooling" and pcfg.custom_text_attr == 99
assert pcfg.vision_config.image_size == 224 and "encode" in pcfg.supported_tasks
with self.assertRaisesRegex(ValueError, "less than -1"):
_mmc(self.mp, self.tp, args={"max_logprobs": -2})
with self.assertRaisesRegex(ValueError, "greater than the vocabulary"):
_mmc(self.mp, self.tp, args={"max_logprobs": 99999})
with self.assertRaisesRegex(ValueError, "does not support.*generate"):
_mmc(self.mp, self.tp, args={"runner": "generate", "model_impl": "fastdeploy"}, reg=_fr(gen=False))
with self.assertRaisesRegex(ValueError, "does not support.*pooling"):
_mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "none"}, reg=_fr(gen=False))
def test_fdconfig_init_cache(self):
parallel_config = ParallelConfig({})
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig({})
cache_config.cache_transfer_protocol = "rdma,ipc"
cache_config.pd_comm_port = "2334"
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
scheduler_config.splitwise_role = "prefill"
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
test_mode=True,
)
fd_config.init_cache_info()
assert fd_config.register_info is not None
def test_modelconfig_mrope_format(self):
mrp = {**_BP, "mrope_section": [16, 24, 24], "rope_scaling": {"type": "mrope", "factor": 1.0}}
cfg = _mmc(self.mp, self.tp, pre=mrp)
assert cfg.rope_3d and cfg.rope_scaling["mrope_section"] == [16, 24, 24] and cfg.freq_allocation == 16
cfg2 = _mmc(self.mp, self.tp, pre={**_BP, "mrope_section": [8, 12, 12]})
assert cfg2.rope_3d and cfg2.rope_scaling == {"mrope_section": [8, 12, 12]}
assert _mmc(self.mp, self.tp, pre={**_BP, "remove_tail_layer": True}).num_hidden_layers == 31
for cj, exp in [
({**_BP, "torch_dtype": "bfloat16"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.57.0"}, "torch"),
({**_BP, "dtype": "bfloat16", "transformers_version": "4.55.0"}, "paddle"),
]:
assert _mmc(self.mp, self.tp, cj=cj).model_format == exp
with self.assertRaisesRegex(ValueError, "Only one of"):
_mmc(self.mp, self.tp, cj={**_BP, "torch_dtype": "bf16", "dtype": "bf16"})
mxfp4 = {**_BP, "quantization_config": {"quant_method": "mxfp4"}}
assert _mmc(self.mp, self.tp, cj=mxfp4).model_format == "torch"
with self.assertRaisesRegex(ValueError, "Unknown model format"):
_mmc(self.mp, self.tp, cj={**_BP})
ecfg = _mmc(self.mp, self.tp, pre={**_BP, "n_shared_experts": 4, "moe_num_shared_experts": None})
assert ecfg.moe_num_shared_experts == 4
(self.tp / "version.yaml").write_text(yaml.dump({"version": "2.0"}))
ecfg.read_model_version()
assert ecfg.version == "2.0"
def test_modelconfig_pooling_tasks(self):
cfg = _mmc(self.mp, self.tp, arch="MysteryArch", reg=_fr(gen=False, arch="OtherArch"))
assert cfg._get_default_runner_type(["MysteryArch"]) == "generate"
assert cfg._get_default_convert_type(["MysteryArch"], "generate") == "none"
_te_reg = _fr(gen=False, pool=True, arch="OtherArch", dpt="CLS")
# fmt: off
pcfg = _mmc(self.mp, self.tp, arch="ToyEmbeddingModel",
args={"runner": "pooling", "convert": "auto"}, reg=_te_reg)
# fmt: on
assert pcfg._get_default_pooling_task(["ToyEmbeddingModel"]) == "embed"
assert pcfg.supported_tasks == ["encode", "embed"]
with self.assertRaisesRegex(TypeError, "PoolerConfig"):
_pa = {"runner": "pooling", "convert": "auto", "override_pooler_config": {"normalize": True}}
_mmc(self.mp, self.tp, arch="ToyEmbeddingModel", args=_pa, reg=_te_reg)
cfg2 = _mmc(self.mp, self.tp)
with self.assertRaises(AssertionError):
cfg2._get_supported_tasks(["LlamaForCausalLM"], "invalid", "none")
assert cfg2._get_download_model("demo") is None
# fmt: off
acfg = _mmc(self.mp, self.tp, args={"runner": "auto", "convert": "auto"},
reg=_fr(gen=False, pool=True, dpt="CLS"))
# fmt: on
assert acfg.runner_type == "pooling" and acfg.convert_type == "none"
assert acfg.pooler_config is not None and acfg.pooler_config.pooling_type == "CLS"
assert "encode" in acfg.supported_tasks
ecfg = _mmc(self.mp, self.tp, args={"runner": "pooling", "convert": "auto"}, reg=_fr(gen=False))
assert ecfg.convert_type == "embed"
def test_fdconfig_postprocess_ports(self):
data_parallel_size = 4
tensor_parallel_size = 2
local_data_parallel_id = random.randint(0, data_parallel_size - 1)
engine_worker_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
cache_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
pd_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)]
rdma_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size * tensor_parallel_size)]
parallel_config = ParallelConfig(
{
"engine_worker_queue_port": ",".join(map(str, engine_worker_queue_ports)),
"data_parallel_size": data_parallel_size,
"tensor_parallel_size": tensor_parallel_size,
"local_data_parallel_id": local_data_parallel_id,
}
)
graph_opt_config = GraphOptimizationConfig({})
cache_config = CacheConfig(
{
"cache_queue_port": ",".join(map(str, cache_queue_ports)),
"pd_comm_port": ",".join(map(str, pd_comm_ports)),
"rdma_comm_ports": ",".join(map(str, rdma_comm_ports)),
}
)
load_config = LoadConfig({})
scheduler_config = SchedulerConfig({})
model_config: Mock = Mock()
model_config.max_model_len = 512
model_config.architectures = ["test_model"]
model_config.mm_max_tokens_per_item = None
fd_config = FDConfig(
parallel_config=parallel_config,
graph_opt_config=graph_opt_config,
cache_config=cache_config,
load_config=load_config,
scheduler_config=scheduler_config,
model_config=model_config,
ips="0.0.0.0",
test_mode=True,
)
assert (
fd_config.parallel_config.local_engine_worker_queue_port
== engine_worker_queue_ports[local_data_parallel_id]
)
assert fd_config.cache_config.local_cache_queue_port == cache_queue_ports[local_data_parallel_id]
assert fd_config.cache_config.local_pd_comm_port == pd_comm_ports[local_data_parallel_id]
assert (
fd_config.cache_config.local_rdma_comm_ports
== rdma_comm_ports[
local_data_parallel_id * tensor_parallel_size : (local_data_parallel_id + 1) * tensor_parallel_size
]
)
class TestFDConfig(unittest.TestCase):
def setUp(self):
self.mp = pytest.MonkeyPatch()
self._td = tempfile.TemporaryDirectory()
self.tp = Path(self._td.name)
def tearDown(self):
self.mp.undo()
self._td.cleanup()
def _cuda(self):
self.mp.setattr("fastdeploy.config.current_platform", _plat(cuda=True))
def test_topology_env(self):
# fmt: off
multi = _mfd(self.mp, ips=["127.0.0.1", "0.0.0.0"],
parallel={"tensor_parallel_size": 16, "expert_parallel_size": 1})
# fmt: on
assert multi.nnode == 2 and multi.is_master is True
# fmt: off
_par = {"engine_worker_queue_port": "8010,8011,8012,8013", "data_parallel_size": 4,
"tensor_parallel_size": 2, "local_data_parallel_id": 2}
_cch = {"cache_queue_port": "8110,8111,8112,8113", "pd_comm_port": "8210,8211,8212,8213",
"rdma_comm_ports": "8310,8311,8320,8321,8330,8331,8340,8341"}
# fmt: on
ported = _mfd(self.mp, ips="0.0.0.0", parallel=_par, cache=_cch)
cc = ported.cache_config
assert ported.parallel_config.local_engine_worker_queue_port == 8012
assert cc.local_cache_queue_port == 8112 and cc.local_pd_comm_port == 8212
assert cc.local_rdma_comm_ports == [8330, 8331]
glm = _mfd(self.mp, model_config=_fdm(architectures=["Glm4MoeForCausalLM"], first_k_dense_replace=2))
assert glm.model_config.moe_layer_start_index == 2
dec = _mfd(self.mp, scheduler={"splitwise_role": "decode", "max_num_seqs": 34, "max_num_batched_tokens": 2048})
assert dec.get_max_chunk_tokens() == 34
dec.test_attr = "1,2,3"
dec._str_to_list("test_attr", int)
assert dec.test_attr == [1, 2, 3]
dec.test_attr2 = None
dec._str_to_list("test_attr2", int)
assert dec.test_attr2 is None
fd = _mfd(self.mp, ips=["10.0.0.1", "127.0.0.1"], parallel={"tensor_parallel_size": 16})
assert fd.is_master is False and fd.master_ip == "10.0.0.1"
# fmt: off
fd_v1 = _mfd(self.mp, scheduler={"name": "local", "splitwise_role": "prefill"},
router_config=SimpleNamespace(router="http://r", api_server_port=8080, metrics_port=9090))
# fmt: on
assert fd_v1.splitwise_version == "v1"
# fmt: off
reg = _mfd(self.mp, cache={"cache_transfer_protocol": "rdma,ipc", "pd_comm_port": "2334"},
scheduler={"splitwise_role": "prefill"})
# fmt: on
assert reg.register_info is not None
pf = _mfd(self.mp, ips="0.0.0.0", scheduler={"splitwise_role": "prefill"})
assert pf.model_config.moe_phase.phase == "prefill"
self.mp.setenv("FD_FOR_TORCH_MODEL_FORMAT", "1")
assert _mfd(self.mp).model_config.model_format == "torch"
self.mp.delenv("FD_FOR_TORCH_MODEL_FORMAT", raising=False)
self.mp.setenv("FD_ENABLE_MAX_PREFILL", "1")
assert _mfd(self.mp, scheduler={"max_num_seqs": 42}).max_prefill_batch == 42
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), cache={"enable_chunked_prefill": True})
assert fd2.scheduler_config.max_num_batched_tokens == 2048
def test_mm_dynload_subconfig(self):
assert _mfd(self.mp, model_config=_mm()).cache_config.max_encoder_cache == 0
e5 = _mfd(self.mp, model_config=_fdm(architectures=["Ernie5ForCausalLM"]))
assert getattr(e5.cache_config, "disable_chunked_mm_input", False) is True
dyn = _mfd(self.mp, load_config=LoadConfig({"dynamic_load_weight": True}))
assert dyn.graph_opt_config.graph_opt_level == 0
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
spf = _mfd(self.mp, speculative_config=sp, scheduler={"splitwise_role": "prefill"})
assert spf.speculative_config.num_speculative_tokens == 1 and spf.speculative_config.num_model_steps == 1
model = _fdm()
model.read_model_version = lambda: setattr(model, "version", "tv")
_rc = SimpleNamespace(router="http://127.0.0.1:8000", api_server_port=8000, metrics_port=8000)
# fmt: off
fd = _mfd(self.mp, model_config=model,
load_config=LoadConfig({"dynamic_load_weight": True}), router_config=_rc)
# fmt: on
assert fd.model_config.version == "tv"
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"gpu_memory_utilization": 1.5, "model_cfg": _mcfg()})
with self.assertRaisesRegex(ValueError, "less than 1.0"):
CacheConfig({"kv_cache_ratio": 1.5, "model_cfg": _mcfg()})
sp2 = SpeculativeConfig({"method": "mtp"})
sp2.print()
with self.assertRaisesRegex(ValueError, "max_ngram_size >= min_ngram_size"):
SpeculativeConfig({"method": "ngram", "max_ngram_size": 1, "min_ngram_size": 5})
sp2._apply_user_args(None)
self.mp.setenv("SPECULATE_VERIFY_USE_TOPK", "1")
assert SpeculativeConfig({"method": "mtp"}).verify_strategy.value == 1
assert SpeculativeConfig({"method": "naive", "num_speculative_tokens": 5}).num_speculative_tokens == 0
ep = EPLBConfig(None)
assert ep.enable_eplb is False
ep.print()
es = EarlyStopConfig({"enable_early_stop": False})
with self.assertRaisesRegex(ValueError, "Cannot set"):
es.update_enable_early_stop(True)
cc = CommitConfig()
cc.fastdeploy_commit = ""
cc._load_from_version_file(str(self.tp / "nonexistent.txt"))
assert cc.fastdeploy_commit == ""
bad = self.tp / "bad_version.txt"
bad.write_bytes(b"\xff\xfe" + bytes(range(128, 256)))
cc._load_from_version_file(str(bad))
cc.print()
def test_v0_platforms(self):
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
c = CacheConfig({"model_cfg": _mcfg(), "cache_dtype": "bfloat16"})
c.max_block_num_per_seq, c.enc_dec_block_num = 4, 0
c.reset(num_gpu_blocks=200)
assert c.total_block_num == 200 and c.prefill_kvcache_block_num == int(200 * c.kv_cache_ratio)
self.mp.delenv("FD_ENABLE_MAX_PREFILL", raising=False)
self._cuda()
assert _mfd(self.mp, model_config=_mm()).max_prefill_batch == 1
# fmt: off
fd = _mfd(self.mp, model_config=_fdm(max_model_len=4096),
scheduler={"max_num_batched_tokens": None, "enable_chunked_prefill": True},
cache={"enable_chunked_prefill": True})
# fmt: on
assert fd.scheduler_config.max_num_batched_tokens == 2048
fd2 = _mfd(self.mp, model_config=_fdm(max_model_len=4096), scheduler={"max_num_batched_tokens": None})
assert fd2.scheduler_config.max_num_batched_tokens == 4096
fd3 = _mfd(self.mp, model_config=_mm(), cache={"enable_prefix_caching": True})
assert fd3.cache_config.enable_prefix_caching is False
self.mp.setattr("fastdeploy.config.current_platform", _plat(xpu=True))
self.mp.setenv("XPU_VISIBLE_DEVICES", "0,1")
assert _mfd(self.mp).parallel_config.device_ids == "0,1"
self.mp.setattr("fastdeploy.config.current_platform", _plat(hpu=True))
self.mp.setenv("HPU_VISIBLE_DEVICES", "2,3")
assert _mfd(self.mp).parallel_config.device_ids == "2,3"
def test_cudagraph_mm_seq(self):
self._cuda()
fd1 = _mfd(self.mp, parallel=_EP, scheduler={"max_num_seqs": 2})
assert fd1.parallel_config.use_sequence_parallel_moe is False
_dec_sch = {"splitwise_role": "decode", "max_num_seqs": 2, "max_num_batched_tokens": 4096}
fd2 = _mfd(self.mp, parallel=_EP, scheduler=_dec_sch)
assert fd2.parallel_config.use_sequence_parallel_moe is False
g = GraphOptimizationConfig({"use_cudagraph": True})
g.cudagraph_capture_sizes = [128, 64, 32, 16, 8, 4, 2, 1]
_dec64 = {"splitwise_role": "decode", "max_num_seqs": 64, "max_num_batched_tokens": 4096}
fd3 = _mfd(self.mp, graph_opt_config=g, parallel=_EP, scheduler=_dec64)
assert all(s % fd3.parallel_config.tensor_parallel_size == 0 for s in g.cudagraph_capture_sizes)
g2 = GraphOptimizationConfig({"use_cudagraph": True, "cudagraph_only_prefill": True})
fd4 = _mfd(self.mp, graph_opt_config=g2, scheduler={"splitwise_role": "prefill"})
assert fd4.graph_opt_config.use_cudagraph is True
sp = SpeculativeConfig({"method": "mtp", "num_speculative_tokens": 1})
fd5 = _mfd(self.mp, ips="0.0.0.0", speculative_config=sp)
assert hasattr(fd5.graph_opt_config, "real_bsz_to_captured_size")
so = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
fd6 = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({"method": "mtp"}))
assert fd6.structured_outputs_config.guided_decoding_backend == "off"
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": -1}).cache_config.max_encoder_cache == 0
assert _mfd(self.mp, model_config=_mm(), cache={"max_encoder_cache": 10}).cache_config.max_encoder_cache == 0
def test_guided_check(self):
self._cuda()
fake_llg = types.ModuleType("llguidance")
fake_llg.torch = types.ModuleType("llguidance.torch")
self.mp.setitem(sys.modules, "llguidance", fake_llg)
self.mp.setitem(sys.modules, "llguidance.torch", fake_llg.torch)
so = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
fd = _mfd(self.mp, structured_outputs_config=so, speculative_config=SpeculativeConfig({}))
assert fd.structured_outputs_config.guided_decoding_backend == "guidance"
with self.assertRaisesRegex(NotImplementedError, "not implemented"):
so_bad = StructuredOutputsConfig({"guided_decoding_backend": "badbackend"})
_mfd(self.mp, structured_outputs_config=so_bad, speculative_config=SpeculativeConfig({}))
self.mp.delitem(sys.modules, "llguidance", raising=False)
self.mp.delitem(sys.modules, "llguidance.torch", raising=False)
with self.assertRaisesRegex(ImportError, "llguidance"):
so_g = StructuredOutputsConfig({"guided_decoding_backend": "guidance"})
_mfd(self.mp, structured_outputs_config=so_g, speculative_config=SpeculativeConfig({}))
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
with self.assertRaises(AssertionError):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512),
cache={"enable_chunked_prefill": False}, scheduler={"max_num_batched_tokens": 256}).check()
# fmt: on
with self.assertRaisesRegex(AssertionError, "long_prefill_token_threshold"):
# fmt: off
_mfd(self.mp, model_config=_fdm(max_model_len=512), max_num_partial_prefills=2,
long_prefill_token_threshold=600, cache={"enable_chunked_prefill": True}).check()
# fmt: on
fake_xg = types.ModuleType("xgrammar")
self.mp.setitem(sys.modules, "xgrammar", fake_xg)
so2 = StructuredOutputsConfig({"guided_decoding_backend": "xgrammar"})
_sp = SpeculativeConfig({})
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.delitem(sys.modules, "xgrammar", raising=False)
with self.assertRaisesRegex(Exception, "XGrammar"):
_mfd(self.mp, ips="0.0.0.0", structured_outputs_config=so2, speculative_config=_sp).check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")
self.mp.setenv("FD_DISABLED_RECOVER", "1")
with self.assertRaisesRegex(AssertionError, "FD_DISABLED_RECOVER"):
_mfd(self.mp, ips="0.0.0.0").check()
self.mp.setenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")
self.mp.setitem(sys.modules, "cuda", None)
self.mp.setitem(sys.modules, "cuda.cuda", None)
with self.assertRaisesRegex(ImportError, "cuda-python"):
_mfd(self.mp, ips="0.0.0.0", eplb_config=EPLBConfig({"enable_eplb": True})).check()
# Test case 1: swap_space is None -> num_cpu_blocks = 0
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": None,
}
)
assert cache_config.num_cpu_blocks == 0
# Test case 2: swap_space = 1GB
# bytes_per_block = head_num * head_dim * byte_size * kv_factor * block_size * num_hidden_layers
# = 32 * 128 * 2 * 2 * 64 * 24 = 25165824 bytes
# num_cpu_blocks = 1 * 1024^3 / 25165824 = 42
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 2 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 42
# Test case 3: swap_space = 2GB
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 2,
}
)
assert cache_config.num_cpu_blocks == 85
# Test case 4: with fp32 dtype (4 bytes)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "float32",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 4 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 21
# Test case 5: with int8 dtype (1 byte)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "int8",
"swap_space": 1,
}
)
expected_blocks = int(1 * 1024**3 / (32 * 128 * 1 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 85
# Test case 6: num_cpu_blocks is explicitly set (not affected by swap_space)
cache_config = CacheConfig(
{
"model_cfg": model_config,
"cache_dtype": "bfloat16",
"swap_space": 10,
"num_cpu_blocks": 100,
}
)
assert cache_config.num_cpu_blocks == 100
# Test case 7: with num_key_value_heads (GQA)
model_config_with_gqa = Mock()
model_config_with_gqa.num_key_value_heads = 8 # GQA
model_config_with_gqa.num_attention_heads = 32
model_config_with_gqa.head_dim = 128
model_config_with_gqa.num_hidden_layers = 24
model_config_with_gqa.quantization = None
model_config_with_gqa.quantization_config = None
cache_config = CacheConfig(
{
"model_cfg": model_config_with_gqa,
"cache_dtype": "bfloat16",
"swap_space": 1,
}
)
# bytes_per_block = 8 * 128 * 2 * 2 * 64 * 24 = 6291456 bytes
# num_cpu_blocks = 1 * 1024^3 / 6291456 = 170
expected_blocks = int(1 * 1024**3 / (8 * 128 * 2 * 2 * 64 * 24))
assert cache_config.num_cpu_blocks == expected_blocks
assert cache_config.num_cpu_blocks == 170
def test_chunk_print_str(self):
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: True)
_dec = {"splitwise_role": "decode", "max_num_seqs": 20, "max_num_batched_tokens": 4096}
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 4096
self.mp.setattr(paddle, "is_compiled_with_xpu", lambda: False)
assert _mfd(self.mp, scheduler=_dec).get_max_chunk_tokens() == 20
fd3 = _mfd(self.mp)
fd3.commit_config, fd3.model_config.print = CommitConfig(), lambda: None
fd3.print()
fd4 = _mfd(self.mp)
fd4.generation_config = SimpleNamespace(to_dict=lambda: {"key": "val"})
for a in ("cache_config", "model_config", "scheduler_config", "parallel_config", "commit_config"):
if (cur := getattr(fd4, a, None)) is not None and not hasattr(cur, "print"):
setattr(fd4, a, SimpleNamespace(print=lambda: None))
fd4.print()
try:
str(_mfd(self.mp))
except Exception:  # str() on a partially mocked config may raise; any failure is tolerated here
pass
fd5 = _mfd(self.mp)
fd5.list_attr = [1, 2, 3]
fd5._str_to_list("list_attr", str)
assert fd5.list_attr == ["1", "2", "3"] and fd5._check_master() == fd5.is_master
_mfd(self.mp, ips="0.0.0.0").check()
if __name__ == "__main__":