# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Focused tests to increase coverage of base.py
Tests actual code paths that were previously uncovered.
"""

import json
import os
import shutil
import tempfile
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch

import numpy as np
import paddle
import pytest
from paddle import nn

from fastdeploy.config import (
    CacheConfig,
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
)
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.linear import (
    ColumnParallelLinear,
    RowParallelLinear,
)
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.paddleformers.base import (
    PaddleFormersRMSNormWrapper,
    getattr_iter,
    maybe_prefix,
)
from fastdeploy.scheduler import SchedulerConfig


@pytest.fixture
def mock_layer_init_patch():
    """Patch nn.Layer.__init__ globally for tests using it."""

    def mock_init(self, *args, **kwargs):
        self._sub_layers = {}
        self._parameters = {}
        self._buffers = {}
        self._loaddict_holder = {}

    with patch.object(nn.Layer, "__init__", mock_init):
        yield


@pytest.fixture
def mock_fd_config():
    """Create a minimal mock FDConfig for testing."""
    tmp_dir = tempfile.mkdtemp(prefix="test_base_")

    config_dict = {
        "architectures": ["LlamaForCausalLM"],
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 2,
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "head_dim": 128,
        "vocab_size": 32000,
        "dtype": "float16",
        "rms_norm_eps": 1e-6,
        "rope_theta": 10000.0,
    }

    config_path = os.path.join(tmp_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config_dict, f)

    model_config = ModelConfig(
        {
            "model": tmp_dir,
            "model_impl": "paddleformers",
            "max_model_len": 2048,
        }
    )

    parallel_config = ParallelConfig(
        {
            "tensor_parallel_size": 1,
            "data_parallel_size": 1,
            "expert_parallel_size": 1,
            "tensor_parallel_rank": 0,
        }
    )
    parallel_config.tp_group = None

    scheduler_config = SchedulerConfig({})

    # Create a proper mock for quant_config with all required attributes.
    mock_quant_config = SimpleNamespace(
        quant_round_type=0,  # Must be int, not str
        quant_max_bound=1.0,
        quant_min_bound=-1.0,
    )
    mock_quant_config.get_quant_method = lambda self: None  # Returning None means no quantization

    fd_config = FDConfig(
        model_config=model_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        cache_config=CacheConfig({}),
        graph_opt_config=GraphOptimizationConfig({}),
        load_config=LoadConfig({}),
        quant_config=mock_quant_config,
        ips="0.0.0.0",
    )

    yield fd_config, tmp_dir

    shutil.rmtree(tmp_dir, ignore_errors=True)


@pytest.fixture
def mock_fd_config_tp2():
    """Create a mock FDConfig with TP=2 for testing."""
    tmp_dir = tempfile.mkdtemp(prefix="test_base_tp2_")

    config_dict = {
        "architectures": ["Qwen3ForCausalLM"],
        "model_type": "qwen3",
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 2,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "head_dim": 128,
        "vocab_size": 32000,
        "dtype": "float16",
        "rms_norm_eps": 1e-6,
        "rope_theta": 10000.0,
    }

    config_path = os.path.join(tmp_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config_dict, f)

    model_config = ModelConfig(
        {
            "model": tmp_dir,
            "model_impl": "paddleformers",
            "max_model_len": 2048,
        }
    )

    parallel_config = ParallelConfig(
        {
            "tensor_parallel_size": 2,  # TP=2
            "data_parallel_size": 1,
            "expert_parallel_size": 1,
            "tensor_parallel_rank": 0,
        }
    )
    parallel_config.tp_group = None

    scheduler_config = SchedulerConfig({})

    mock_quant_config = SimpleNamespace(
        quant_round_type=0,
        quant_max_bound=1.0,
        quant_min_bound=-1.0,
    )
    mock_quant_config.get_quant_method = lambda self: None

    fd_config = FDConfig(
        model_config=model_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        cache_config=CacheConfig({}),
        graph_opt_config=GraphOptimizationConfig({}),
        load_config=LoadConfig({}),
        quant_config=mock_quant_config,
        ips="0.0.0.0",
    )

    yield fd_config, tmp_dir

    shutil.rmtree(tmp_dir, ignore_errors=True)


@pytest.fixture
def mock_fd_config_qwen3():
    """Create a mock FDConfig with model_type=qwen3 for testing fusion settings."""
    tmp_dir = tempfile.mkdtemp(prefix="test_base_qwen3_")

    config_dict = {
        "architectures": ["Qwen3ForCausalLM"],
        "model_type": "qwen3",
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 2,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "head_dim": 128,
        "vocab_size": 32000,
        "dtype": "float16",
        "rms_norm_eps": 1e-6,
        "rope_theta": 10000.0,
    }

    config_path = os.path.join(tmp_dir, "config.json")
    with open(config_path, "w") as f:
        json.dump(config_dict, f)

    model_config = ModelConfig(
        {
            "model": tmp_dir,
            "model_impl": "paddleformers",
            "max_model_len": 2048,
        }
    )

    parallel_config = ParallelConfig(
        {
            "tensor_parallel_size": 1,  # TP=1 to enable fused QKV
            "data_parallel_size": 1,
            "expert_parallel_size": 1,
            "tensor_parallel_rank": 0,
        }
    )
    parallel_config.tp_group = None

    scheduler_config = SchedulerConfig({})

    mock_quant_config = SimpleNamespace(
        quant_round_type=0,
        quant_max_bound=1.0,
        quant_min_bound=-1.0,
    )
    mock_quant_config.get_quant_method = lambda self: None

    fd_config = FDConfig(
        model_config=model_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        cache_config=CacheConfig({}),
        graph_opt_config=GraphOptimizationConfig({}),
        load_config=LoadConfig({}),
        quant_config=mock_quant_config,
        ips="0.0.0.0",
    )

    yield fd_config, tmp_dir

    shutil.rmtree(tmp_dir, ignore_errors=True)


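# The three fixtures above differ only in architecture, model_type, KV-head
# count, and TP size. The helper below is a hypothetical consolidation sketch
# (not used by the tests); it is kept here only to make the shared structure
# explicit.
def _make_fd_config_sketch(tmp_dir, architecture, num_kv_heads=32, tp_size=1, model_type=None):
    """Build an FDConfig the way the fixtures above do (illustrative only)."""
    config_dict = {
        "architectures": [architecture],
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_hidden_layers": 2,
        "num_attention_heads": 32,
        "num_key_value_heads": num_kv_heads,
        "head_dim": 128,
        "vocab_size": 32000,
        "dtype": "float16",
        "rms_norm_eps": 1e-6,
        "rope_theta": 10000.0,
    }
    if model_type is not None:
        config_dict["model_type"] = model_type
    with open(os.path.join(tmp_dir, "config.json"), "w") as f:
        json.dump(config_dict, f)

    parallel_config = ParallelConfig(
        {
            "tensor_parallel_size": tp_size,
            "data_parallel_size": 1,
            "expert_parallel_size": 1,
            "tensor_parallel_rank": 0,
        }
    )
    parallel_config.tp_group = None

    mock_quant_config = SimpleNamespace(quant_round_type=0, quant_max_bound=1.0, quant_min_bound=-1.0)
    mock_quant_config.get_quant_method = lambda self: None

    return FDConfig(
        model_config=ModelConfig({"model": tmp_dir, "model_impl": "paddleformers", "max_model_len": 2048}),
        parallel_config=parallel_config,
        scheduler_config=SchedulerConfig({}),
        cache_config=CacheConfig({}),
        graph_opt_config=GraphOptimizationConfig({}),
        load_config=LoadConfig({}),
        quant_config=mock_quant_config,
        ips="0.0.0.0",
    )

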
class TestUtilityFunctions:
    """Test utility functions to cover lines 69-79."""

    def test_getattr_iter(self):
        """Test getattr_iter with various scenarios."""
        obj = SimpleNamespace(a=1, b=2, c=3)

        # First match
        assert getattr_iter(obj, ["b", "a"], default=None) == 2

        # No match returns default
        assert getattr_iter(obj, ["x", "y"], default=999) == 999

        # Multiple names, find second match
        assert getattr_iter(obj, ["x", "c"], default=None) == 3

    def test_maybe_prefix(self):
        """Test maybe_prefix with various scenarios."""
        # With prefix
        assert maybe_prefix("model", "layers.0") == "model.layers.0"

        # Empty prefix
        assert maybe_prefix("", "layers.0") == "layers.0"

        # None prefix
        assert maybe_prefix(None, "layers.0") == "layers.0"


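# For reference, a minimal sketch of the semantics the assertions above pin
# down. Illustration only: the real implementations live in
# fastdeploy.model_executor.models.paddleformers.base and may differ in detail.
def _reference_getattr_iter(obj, names, default=None):
    """Return the first attribute in `names` present on `obj`, else `default`."""
    for name in names:
        if hasattr(obj, name):
            return getattr(obj, name)
    return default


def _reference_maybe_prefix(prefix, name):
    """Join `prefix` and `name` with a dot; an empty or None prefix is dropped."""
    return f"{prefix}.{name}" if prefix else name

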
class TestRMSNormWrapper:
    """Test PaddleFormersRMSNormWrapper to cover lines 48-66."""

    def test_wrapper_init_and_forward(self, mock_fd_config):
        """Test creating the wrapper and running a forward pass."""
        fd_config, _ = mock_fd_config

        fd_rmsnorm = RMSNorm(
            fd_config=fd_config,
            hidden_size=768,
            eps=1e-6,
            prefix="test",
            begin_norm_axis=-1,
        )

        wrapper = PaddleFormersRMSNormWrapper(fd_rmsnorm)

        # Check initialization
        assert wrapper._fd_rmsnorm is fd_rmsnorm
        assert wrapper.weight is fd_rmsnorm.weight

        # Test forward - the FD RMSNorm returns (output, residual_out)
        x = paddle.randn([10, 768])
        result = wrapper.forward(x)

        # The wrapper should return only the output tensor
        assert isinstance(result, paddle.Tensor)
        assert result.shape == [10, 768]


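# Hedged sketch of the wrapper contract exercised above (illustration only, not
# the real class): FastDeploy's RMSNorm returns an (output, residual_out) tuple,
# and the wrapper exposes the underlying weight while returning only the output.
class _ReferenceRMSNormWrapper:
    def __init__(self, fd_rmsnorm):
        self._fd_rmsnorm = fd_rmsnorm
        self.weight = fd_rmsnorm.weight

    def forward(self, x):
        out, _residual_out = self._fd_rmsnorm(x)
        return out

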
class TestAttentionForward:
    """Test fastdeploy_append_attention_forward to cover lines 82-163."""

    def test_missing_required_attributes(self):
        """Test that missing required attributes raise ValueError."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        module = SimpleNamespace()
        query = paddle.randn([1, 32, 10, 128])
        key = paddle.randn([1, 32, 10, 128])
        value = paddle.randn([1, 32, 10, 128])
        attention_mask = paddle.ones([1, 10])

        # Missing config
        with pytest.raises(ValueError, match="does not have 'config' attribute"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

        # Missing attention_instances
        module.config = SimpleNamespace()
        with pytest.raises(ValueError, match="attention_instances not found"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

        # Missing forward_meta
        module.config.attention_instances = {}
        with pytest.raises(ValueError, match="forward_meta not found"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

        # Missing layer_idx
        module.config.forward_meta = SimpleNamespace()
        with pytest.raises(ValueError, match="layer_idx not found"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

    def test_valid_forward_call(self):
        """Test valid forward call with all required attributes."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        mock_attention = MagicMock()
        mock_attention.num_heads = 32
        mock_attention.num_key_value_heads = 32
        mock_attention.forward = Mock(return_value=paddle.randn([10, 128 * 32]))
        forward_meta = SimpleNamespace(rotary_embs=None)

        module = SimpleNamespace(
            config=SimpleNamespace(
                attention_instances={0: mock_attention},
                forward_meta=forward_meta,
                num_attention_heads=32,
                num_key_value_heads=32,
            ),
            layer_idx=0,
            num_heads=32,
            num_key_value_heads=32,
        )

        query = paddle.randn([1, 32, 10, 128])
        key = paddle.randn([1, 32, 10, 128])
        value = paddle.randn([1, 32, 10, 128])
        attention_mask = paddle.ones([1, 10])

        output, _ = fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

        assert mock_attention.forward.called

    def test_invalid_batch_size(self):
        """Test that batch size != 1 raises ValueError."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        mock_attention = MagicMock()
        forward_meta = SimpleNamespace(rotary_embs=None)

        module = SimpleNamespace(
            config=SimpleNamespace(attention_instances={0: mock_attention}, forward_meta=forward_meta), layer_idx=0
        )

        query = paddle.randn([2, 32, 10, 128])  # Batch size 2
        key = paddle.randn([2, 32, 10, 128])
        value = paddle.randn([2, 32, 10, 128])
        attention_mask = paddle.ones([2, 10])

        with pytest.raises(ValueError, match="batch size.*not supported"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

    def test_scaling_parameter(self):
        """Test that the scaling parameter sets the attention scale."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        mock_attention = MagicMock()
        mock_attention.num_heads = 32
        mock_attention.num_key_value_heads = 32
        mock_attention.forward = Mock(return_value=paddle.randn([10, 128 * 32]))
        forward_meta = SimpleNamespace(rotary_embs=None)

        module = SimpleNamespace(
            config=SimpleNamespace(
                attention_instances={0: mock_attention},
                forward_meta=forward_meta,
                num_attention_heads=32,
                num_key_value_heads=32,
            ),
            layer_idx=0,
            num_heads=32,
            num_key_value_heads=32,
        )

        query = paddle.randn([1, 32, 10, 128])
        key = paddle.randn([1, 32, 10, 128])
        value = paddle.randn([1, 32, 10, 128])
        attention_mask = paddle.ones([1, 10])

        output, _ = fastdeploy_append_attention_forward(module, query, key, value, attention_mask, scaling=0.5)

        assert mock_attention.scale == 0.5


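# Hedged sketch of the precondition chain the error messages above suggest, in
# the order it appears to be checked (illustration only; the real validation
# lives in fastdeploy_append_attention_forward):
def _reference_precondition_check(module):
    if not hasattr(module, "config"):
        raise ValueError("module does not have 'config' attribute")
    if getattr(module.config, "attention_instances", None) is None:
        raise ValueError("attention_instances not found")
    if getattr(module.config, "forward_meta", None) is None:
        raise ValueError("forward_meta not found")
    if getattr(module, "layer_idx", None) is None:
        raise ValueError("layer_idx not found")

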
class TestConfigSync:
    """Test _sync_config_from_text_config to cover lines 287-322."""

    def test_sync_tie_word_embeddings(self, mock_fd_config):
        """Test syncing tie_word_embeddings from text_config."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        text_config = SimpleNamespace(
            tie_word_embeddings=True,
            hidden_size=4096,
        )

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = text_config

            model._sync_config_from_text_config()

            assert model.model_config.tie_word_embeddings is True

    def test_sync_multiple_fields(self, mock_fd_config):
        """Test syncing multiple fields from text_config."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        text_config = SimpleNamespace(
            sliding_window=4096,
            rope_theta=1000000.0,
            rms_norm_eps=1e-5,
        )

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = text_config

            model._sync_config_from_text_config()

            assert model.model_config.sliding_window == 4096
            assert model.model_config.rope_theta == 1000000.0
            assert model.model_config.rms_norm_eps == 1e-5

    def test_skips_none_values(self, mock_fd_config):
        """Test that None values are not synced."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        fd_config.model_config.sliding_window = 2048

        text_config = SimpleNamespace(
            sliding_window=None,
            rope_theta=10000.0,
        )

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = text_config

            model._sync_config_from_text_config()

            # sliding_window should remain unchanged
            assert model.model_config.sliding_window == 2048
            assert model.model_config.rope_theta == 10000.0


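# The sync contract pinned down above, as a minimal sketch (illustration only):
# copy each known field from text_config onto model_config, skipping fields that
# are absent or None so existing values survive.
def _reference_sync_config(
    model_config,
    text_config,
    fields=("tie_word_embeddings", "sliding_window", "rope_theta", "rms_norm_eps"),
):
    for name in fields:
        value = getattr(text_config, name, None)
        if value is not None:
            setattr(model_config, name, value)

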
class TestAttentionInstances:
    """Test create_attention_instances to cover lines 523-555."""

    def test_creates_instances_for_all_layers(self, mock_fd_config):
        """Test that attention instances are created for all layers."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        mock_model = SimpleNamespace()

        class TestModel(PaddleFormersModelBase):
            pass

        with (
            patch("paddleformers.transformers.AutoModel", return_value=mock_model),
            patch("paddleformers.transformers.AutoConfig"),
            patch.object(Attention, "__init__", return_value=None),
        ):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                num_hidden_layers=4,
                vocab_size=32000,
            )
            model.model = mock_model

            instances = model.create_attention_instances()

            assert len(instances) == 4
            assert all(isinstance(key, int) for key in instances.keys())

    def test_sliding_window_sets_layer_types(self, mock_fd_config):
        """Test that sliding_window creates layer_types config."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        mock_model = SimpleNamespace()

        class TestModel(PaddleFormersModelBase):
            pass

        with (
            patch("paddleformers.transformers.AutoModel", return_value=mock_model),
            patch("paddleformers.transformers.AutoConfig"),
            patch.object(Attention, "__init__", return_value=None),
        ):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                num_hidden_layers=4,
                vocab_size=32000,
                sliding_window=4096,
                sliding_window_pattern=2,
            )
            model.model = mock_model

            _ = model.create_attention_instances()

            assert hasattr(model.model_config, "layer_types")
            assert len(model.model_config.layer_types) == 4
            assert model.model_config.sliding_window == 4096


class TestEmbedInputIds:
    """Test embed_input_ids to cover lines 557-564."""

    def test_basic_embedding(self, mock_fd_config):
        """Test basic embedding lookup."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        mock_embedding = Mock()
        mock_embedding.return_value = paddle.randn([10, 4096])

        mock_model = Mock()
        mock_model.get_input_embeddings.return_value = mock_embedding

        class TestModel(PaddleFormersModelBase):
            pass

        with (
            patch("paddleformers.transformers.AutoModel", return_value=mock_model),
            patch("paddleformers.transformers.AutoConfig"),
        ):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model
            model.embed_scale = None

            input_ids = paddle.to_tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype="int64")
            embeddings = model.embed_input_ids(input_ids)

            assert embeddings.shape == [10, 4096]

    def test_embedding_with_scale(self, mock_fd_config):
        """Test embedding with embed_scale."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        mock_embedding = Mock()
        mock_embedding.return_value = paddle.randn([10, 4096]) * 0.5

        mock_model = Mock()
        mock_model.get_input_embeddings.return_value = mock_embedding

        class TestModel(PaddleFormersModelBase):
            pass

        with (
            patch("paddleformers.transformers.AutoModel", return_value=mock_model),
            patch("paddleformers.transformers.AutoConfig"),
        ):
            model = object.__new__(TestModel)
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model
            model.embed_scale = 0.5

            input_ids = paddle.to_tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype="int64")
            embeddings = model.embed_input_ids(input_ids)

            assert embeddings.shape == [10, 4096]


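# Hedged sketch of the embedding contract exercised above (illustration only):
# look up token embeddings through the wrapped model, then apply embed_scale
# when it is set.
def _reference_embed_input_ids(model, input_ids, embed_scale=None):
    embeddings = model.get_input_embeddings()(input_ids)
    return embeddings if embed_scale is None else embeddings * embed_scale

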
class TestRecursiveReplace:
    """Test recursive_replace to cover lines 308-393."""

    def test_replaces_linear_layers(self, mock_fd_config):
        """Test that nn.Linear layers are replaced with FD parallel layers."""
        from fastdeploy.model_executor.layers.linear import ReplicatedLinear
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock model whose Linear layers all match TP patterns
        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                # Colwise patterns
                self.q_proj = nn.Linear(4096, 4096)
                self.k_proj = nn.Linear(4096, 1024)  # GQA style
                self.v_proj = nn.Linear(4096, 1024)  # GQA style
                self.gate_proj = nn.Linear(4096, 11008)
                self.up_proj = nn.Linear(4096, 11008)
                # Rowwise patterns
                self.o_proj = nn.Linear(4096, 4096)
                self.down_proj = nn.Linear(11008, 4096)
                # No pattern - replicated
                self.other_linear = nn.Linear(100, 100)

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            # Manually add required attributes since we bypassed __init__.
            # These MUST be set before assigning any sublayers.
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            # Call recursive_replace
            model.recursive_replace()

            # Verify colwise layers were replaced with ColumnParallelLinear
            assert isinstance(model.model.q_proj, ColumnParallelLinear)
            assert isinstance(model.model.k_proj, ColumnParallelLinear)
            assert isinstance(model.model.v_proj, ColumnParallelLinear)
            assert isinstance(model.model.gate_proj, ColumnParallelLinear)
            assert isinstance(model.model.up_proj, ColumnParallelLinear)
            # Verify rowwise layers were replaced with RowParallelLinear
            assert isinstance(model.model.o_proj, RowParallelLinear)
            assert isinstance(model.model.down_proj, RowParallelLinear)
            # Verify non-matching layers become ReplicatedLinear
            assert isinstance(model.model.other_linear, ReplicatedLinear)

    def test_replaces_rmsnorm_layers(self, mock_fd_config):
        """Test that RMSNorm layers are wrapped."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock RMSNorm-like class
        class MockRMSNorm(nn.Layer):
            def __init__(self):
                super().__init__()  # Must call super first
                self.weight = paddle.create_parameter(
                    shape=[4096], dtype="float32", default_initializer=paddle.nn.initializer.Constant(value=1.0)
                )
                self.epsilon = 1e-6

        # Create a mock model with RMSNorm
        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()  # Must call super first
                self.input_layernorm = MockRMSNorm()
                self.post_attention_layernorm = MockRMSNorm()

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            # Manually add required attributes since we bypassed __init__.
            # These MUST be set before assigning any sublayers.
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj

            # Call recursive_replace
            model.recursive_replace()

            # Verify RMSNorm layers were wrapped
            assert isinstance(model.model.input_layernorm, PaddleFormersRMSNormWrapper)
            assert isinstance(model.model.post_attention_layernorm, PaddleFormersRMSNormWrapper)

    def test_nested_module_replacement(self, mock_fd_config):
        """Test that nested modules are also processed."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create nested mock modules
        class MockAttention(nn.Layer):
            def __init__(self):
                super().__init__()  # Must call super first
                self.q_proj = nn.Linear(4096, 4096)
                self.k_proj = nn.Linear(4096, 4096)

        class MockLayer(nn.Layer):
            def __init__(self):
                super().__init__()  # Must call super first
                self.attention = MockAttention()
                self.mlp_down = nn.Linear(11008, 4096)

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()  # Must call super first
                self.layers = nn.LayerList([MockLayer()])

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            # Manually add required attributes since we bypassed __init__.
            # These MUST be set before assigning any sublayers.
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj

            # Call recursive_replace
            model.recursive_replace()

            # Verify nested layers were also replaced
            assert isinstance(model.model.layers[0].attention.q_proj, ColumnParallelLinear)
            assert isinstance(model.model.layers[0].attention.k_proj, ColumnParallelLinear)
            # mlp_down doesn't match any TP pattern, so it becomes ReplicatedLinear
            from fastdeploy.model_executor.layers.linear import ReplicatedLinear

            assert isinstance(model.model.layers[0].mlp_down, ReplicatedLinear)


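# The replacement policy pinned down above, summarized for reference. The real
# matching lives in recursive_replace / _get_tp_plan; this table illustrates the
# default plan the assertions rely on:
#   colwise -> ColumnParallelLinear: q_proj, k_proj, v_proj, gate_proj, up_proj
#   rowwise -> RowParallelLinear:    o_proj, down_proj
#   no match -> ReplicatedLinear:    everything else
_REFERENCE_DEFAULT_TP_PLAN = {
    r"\.q_proj$": "colwise",
    r"\.k_proj$": "colwise",
    r"\.v_proj$": "colwise",
    r"\.gate_proj$": "colwise",
    r"\.up_proj$": "colwise",
    r"\.o_proj$": "rowwise",
    r"\.down_proj$": "rowwise",
}

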
class TestAttentionForwardEdgeCases:
    """Test fastdeploy_append_attention_forward with the joint QKV layout strategy."""

    @staticmethod
    def _flatten_layout(t: paddle.Tensor, layout: str) -> paddle.Tensor:
        """Flatten Q/K/V into [S, H*D] under the given layout."""
        t3 = t.squeeze(0) if t.ndim == 4 else t
        if layout == "hsd":
            return t3.transpose([1, 0, 2]).reshape([int(t3.shape[1]), -1])
        if layout == "shd":
            return t3.reshape([int(t3.shape[0]), -1])
        raise ValueError(f"Unsupported layout: {layout}")

    def _assert_qkv_concat_matches_known_layout(
        self,
        qkv: paddle.Tensor,
        query: paddle.Tensor,
        key: paddle.Tensor,
        value: paddle.Tensor,
    ) -> None:
        """Verify the output matches a known flatten rule (new or legacy implementation)."""
        matched_layouts = []

        for layout in ("shd", "hsd"):
            q_flat = self._flatten_layout(query, layout)
            k_flat = self._flatten_layout(key, layout)
            v_flat = self._flatten_layout(value, layout)

            q_seq, k_seq, v_seq = int(q_flat.shape[0]), int(k_flat.shape[0]), int(v_flat.shape[0])
            if not (q_seq == k_seq == v_seq == int(qkv.shape[0])):
                continue

            q_width, k_width, v_width = int(q_flat.shape[1]), int(k_flat.shape[1]), int(v_flat.shape[1])
            if q_width + k_width + v_width != int(qkv.shape[1]):
                continue

            q_part = qkv[:, :q_width]
            k_part = qkv[:, q_width : q_width + k_width]
            v_part = qkv[:, q_width + k_width :]

            if (
                bool(paddle.allclose(q_part, q_flat))
                and bool(paddle.allclose(k_part, k_flat))
                and bool(paddle.allclose(v_part, v_flat))
            ):
                matched_layouts.append(layout)

        # Legacy-implementation compatibility: fall back to flattening K/V against the query's seq_len.
        def _legacy_flatten(t: paddle.Tensor, seq_len: int) -> paddle.Tensor:
            if t.ndim == 3:
                return t.reshape([int(t.shape[0]), -1])
            t3 = t.squeeze(0)
            dim1, dim2 = int(t3.shape[0]), int(t3.shape[1])
            if dim2 == seq_len:
                return t3.transpose([1, 0, 2]).reshape([seq_len, -1])
            if dim1 == seq_len:
                return t3.reshape([seq_len, -1])
            return t3.transpose([1, 0, 2]).reshape([seq_len, -1])

        legacy_seq = int(query.shape[-2]) if query.ndim == 4 else int(query.shape[0])
        q_legacy = _legacy_flatten(query, legacy_seq)
        k_legacy = _legacy_flatten(key, legacy_seq)
        v_legacy = _legacy_flatten(value, legacy_seq)
        if int(q_legacy.shape[0]) == int(k_legacy.shape[0]) == int(v_legacy.shape[0]) == int(qkv.shape[0]) and int(
            q_legacy.shape[1]
        ) + int(k_legacy.shape[1]) + int(v_legacy.shape[1]) == int(qkv.shape[1]):
            q_width = int(q_legacy.shape[1])
            k_width = int(k_legacy.shape[1])
            if (
                bool(paddle.allclose(qkv[:, :q_width], q_legacy))
                and bool(paddle.allclose(qkv[:, q_width : q_width + k_width], k_legacy))
                and bool(paddle.allclose(qkv[:, q_width + k_width :], v_legacy))
            ):
                matched_layouts.append("legacy_query_seq")

        assert matched_layouts, (
            "QKV output does not match known flatten rules (SHD/HSD/legacy_query_seq). "
            f"qkv_shape={list(qkv.shape)}, query={list(query.shape)}, key={list(key.shape)}, value={list(value.shape)}"
        )

    @staticmethod
    def _run_attention(
        query: paddle.Tensor,
        key: paddle.Tensor,
        value: paddle.Tensor,
        num_heads: int | None = None,
        num_kv_heads: int | None = None,
        expected_seq_len: int | None = None,
        tp_size: int = 1,
    ):
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        captured = {}

        def fake_forward(qkv, forward_meta):
            captured["qkv"] = qkv
            return paddle.zeros([qkv.shape[0], qkv.shape[1] // 3], dtype=qkv.dtype)

        mock_attention = SimpleNamespace(
            forward=Mock(side_effect=fake_forward),
        )
        mock_attention.fd_config = SimpleNamespace(
            parallel_config=SimpleNamespace(tensor_parallel_size=tp_size),
        )
        if num_heads is not None:
            mock_attention.num_heads = num_heads
        if num_kv_heads is not None:
            mock_attention.num_key_value_heads = num_kv_heads

        forward_meta = SimpleNamespace(rotary_embs=None)
        if expected_seq_len is not None:
            forward_meta.ids_remove_padding = paddle.arange(expected_seq_len, dtype="int64")

        config = SimpleNamespace(attention_instances={0: mock_attention}, forward_meta=forward_meta)
        if num_heads is not None:
            config.num_attention_heads = num_heads
        if num_kv_heads is not None:
            config.num_key_value_heads = num_kv_heads
            config.kv_num_heads = num_kv_heads

        module = SimpleNamespace(config=config, layer_idx=0)
        if num_heads is not None:
            module.num_heads = num_heads
        if num_kv_heads is not None:
            module.num_key_value_heads = num_kv_heads
            module.kv_num_heads = num_kv_heads

        mask_seq = expected_seq_len if expected_seq_len is not None else int(query.shape[-2])
        attention_mask = paddle.ones([1, int(mask_seq)], dtype=query.dtype)

        out, _ = fastdeploy_append_attention_forward(module, query, key, value, attention_mask)
        assert isinstance(out, paddle.Tensor)
        return captured["qkv"]

    def test_invalid_tensor_dims_raises_error(self):
        """Invalid dimensions (2-D) should fail with a tensor-rank error."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        module = SimpleNamespace(
            config=SimpleNamespace(
                attention_instances={0: SimpleNamespace(forward=Mock(return_value=paddle.zeros([1, 1])))},
                forward_meta=SimpleNamespace(rotary_embs=None),
                num_attention_heads=2,
            ),
            layer_idx=0,
        )
        query = paddle.randn([10, 128])
        key = paddle.randn([10, 128])
        value = paddle.randn([10, 128])
        attention_mask = paddle.ones([1, 10])

        with pytest.raises(ValueError, match="unexpected dims"):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)

    def test_bhsd_data_correctness(self):
        """BHSD [B, H, S, D] should be flattened to [S, H*D]."""
        query = paddle.to_tensor(np.arange(24, dtype=np.float32).reshape([1, 2, 3, 4]))
        key = paddle.to_tensor((np.arange(24, dtype=np.float32) + 100).reshape([1, 2, 3, 4]))
        value = paddle.to_tensor((np.arange(24, dtype=np.float32) + 200).reshape([1, 2, 3, 4]))

        qkv = self._run_attention(query, key, value, num_heads=2, num_kv_heads=2, expected_seq_len=3)

        expected_q = query.squeeze(0).transpose([1, 0, 2]).reshape([3, -1])
        expected_k = key.squeeze(0).transpose([1, 0, 2]).reshape([3, -1])
        expected_v = value.squeeze(0).transpose([1, 0, 2]).reshape([3, -1])

        q_width = expected_q.shape[1]
        k_width = expected_k.shape[1]
        assert paddle.allclose(qkv[:, :q_width], expected_q)
        assert paddle.allclose(qkv[:, q_width : q_width + k_width], expected_k)
        assert paddle.allclose(qkv[:, q_width + k_width :], expected_v)

    def test_bshd_data_correctness(self):
        """BSHD [B, S, H, D] should be flattened to [S, H*D]."""
        query = paddle.to_tensor(np.arange(24, dtype=np.float32).reshape([1, 3, 2, 4]))
        key = paddle.to_tensor((np.arange(24, dtype=np.float32) + 100).reshape([1, 3, 2, 4]))
        value = paddle.to_tensor((np.arange(24, dtype=np.float32) + 200).reshape([1, 3, 2, 4]))

        qkv = self._run_attention(query, key, value, num_heads=2, num_kv_heads=2, expected_seq_len=3)
        self._assert_qkv_concat_matches_known_layout(qkv, query, key, value)

    def test_joint_layout_with_gqa(self):
        """Q uses num_heads while K/V use num_kv_heads, and the layout is selected jointly."""
        # BSHD tensors: Q heads=4, KV heads=2, seq=3, head_dim=2
        query = paddle.to_tensor(np.arange(24, dtype=np.float32).reshape([1, 3, 4, 2]))
        key = paddle.to_tensor((np.arange(12, dtype=np.float32) + 100).reshape([1, 3, 2, 2]))
        value = paddle.to_tensor((np.arange(12, dtype=np.float32) + 200).reshape([1, 3, 2, 2]))

        qkv = self._run_attention(query, key, value, num_heads=4, num_kv_heads=2, expected_seq_len=3)
        self._assert_qkv_concat_matches_known_layout(qkv, query, key, value)

    def test_joint_layout_with_tp_local_heads(self):
        """Under TP, local head counts should also be recognized as a valid layout."""
        # global: q=8, kv=4; local (TP=2): q=4, kv=2
        query = paddle.to_tensor(np.arange(40, dtype=np.float32).reshape([1, 4, 5, 2]))
        key = paddle.to_tensor((np.arange(20, dtype=np.float32) + 100).reshape([1, 2, 5, 2]))
        value = paddle.to_tensor((np.arange(20, dtype=np.float32) + 200).reshape([1, 2, 5, 2]))

        qkv = self._run_attention(query, key, value, num_heads=8, num_kv_heads=4, expected_seq_len=5, tp_size=2)
        self._assert_qkv_concat_matches_known_layout(qkv, query, key, value)

    def test_gqa_shd_layout_detection(self):
        """GQA with SHD layout: num_heads in dim1 should be detected as shd."""
        # shape_3d=(5, 3, 2): with num_heads=3 and num_kv_heads=3, dim1=3 matches -> shd
        query = paddle.to_tensor(np.arange(30, dtype=np.float32).reshape([1, 5, 3, 2]))
        key = paddle.to_tensor((np.arange(30, dtype=np.float32) + 100).reshape([1, 5, 3, 2]))
        value = paddle.to_tensor((np.arange(30, dtype=np.float32) + 200).reshape([1, 5, 3, 2]))

        # num_heads=3 matches dim1, so this is the SHD layout
        qkv = self._run_attention(query, key, value, num_heads=3, num_kv_heads=3, expected_seq_len=5)
        self._assert_qkv_concat_matches_known_layout(qkv, query, key, value)

    def test_ambiguous_h_equals_s_defaults_to_hsd(self):
        """When both layouts are valid (S == H), the default should be hsd (BHSD/HSD-style)."""
        # Ambiguous shape [1, 3, 3, 2]: both hsd and shd are valid; the policy defaults to hsd.
        query = paddle.to_tensor(np.arange(18, dtype=np.float32).reshape([1, 3, 3, 2]))
        key = paddle.to_tensor((np.arange(18, dtype=np.float32) + 100).reshape([1, 3, 3, 2]))
        value = paddle.to_tensor((np.arange(18, dtype=np.float32) + 200).reshape([1, 3, 3, 2]))

        qkv = self._run_attention(query, key, value, num_heads=3, num_kv_heads=3, expected_seq_len=3)

        expected_q_hsd = query.squeeze(0).transpose([1, 0, 2]).reshape([3, -1])
        expected_q_shd = query.squeeze(0).reshape([3, -1])
        q_width = expected_q_hsd.shape[1]

        assert paddle.allclose(qkv[:, :q_width], expected_q_hsd)
        assert not paddle.allclose(qkv[:, :q_width], expected_q_shd)

    def test_mismatched_layout_raises(self):
        """If Q/K/V shapes don't match the expected heads/layout, raise an error."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            fastdeploy_append_attention_forward,
        )

        mock_attention = SimpleNamespace(
            num_heads=2,
            num_key_value_heads=2,
            forward=Mock(return_value=paddle.zeros([1, 1])),
        )
        module = SimpleNamespace(
            config=SimpleNamespace(
                attention_instances={0: mock_attention},
                forward_meta=SimpleNamespace(),
                num_attention_heads=2,
                num_key_value_heads=2,
            ),
            layer_idx=0,
        )

        # Construct clearly inconsistent K/V shapes so both the new and legacy layout strategies fail.
        query = paddle.randn([1, 2, 3, 4])
        key = paddle.randn([1, 4, 5, 4])
        value = paddle.randn([1, 4, 5, 4])
        attention_mask = paddle.ones([1, 3], dtype=query.dtype)

        with pytest.raises(ValueError):
            fastdeploy_append_attention_forward(module, query, key, value, attention_mask)


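# A concrete illustration of the two flatten rules exercised above. With batch 1,
# "hsd" reads the remaining dims as [H, S, D] and "shd" as [S, H, D]; both
# flatten to [S, H*D] but order the head data differently.
def _demo_flatten_rules():
    t = paddle.arange(24, dtype="float32").reshape([1, 2, 3, 4])
    hsd = t.squeeze(0).transpose([1, 0, 2]).reshape([3, -1])  # read as [H=2, S=3, D=4] -> [3, 8]
    shd = t.squeeze(0).reshape([2, -1])  # read as [S=2, H=3, D=4] -> [2, 12]
    assert hsd.shape == [3, 8]
    assert shd.shape == [2, 12]

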
class TestRecursiveReplaceAdvanced:
    """Test recursive_replace advanced cases to cover more lines."""

    def test_fused_qkv_replacement(self, mock_fd_config):
        """Test that qkv_proj with fused QKV uses PaddleFormersQKVParallelLinear."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
            PaddleFormersQKVParallelLinear,
        )

        fd_config, _ = mock_fd_config

        # Create a mock model with a qkv_proj layer
        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                self.qkv_proj = nn.Linear(4096, 4096 * 3)

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = True  # Enable fused QKV
            model._use_fused_ffn = False

            model.recursive_replace()

            # qkv_proj should become PaddleFormersQKVParallelLinear
            assert isinstance(model.model.qkv_proj, PaddleFormersQKVParallelLinear)

    def test_fused_ffn_replacement(self, mock_fd_config):
        """Test that up_gate_proj with fused FFN uses MergedColumnParallelLinear (lines 340-347)."""
        from fastdeploy.model_executor.layers.linear import MergedColumnParallelLinear
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock model with an up_gate_proj layer
        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                self.up_gate_proj = nn.Linear(4096, 11008 * 2)

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            # Override _get_tp_plan to include up_gate_proj as colwise
            def _get_tp_plan(self):
                return {
                    r"\.up_gate_proj$": "colwise",
                }

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = True  # Enable fused FFN

            model.recursive_replace()

            # up_gate_proj should become MergedColumnParallelLinear
            assert isinstance(model.model.up_gate_proj, MergedColumnParallelLinear)

    def test_rmsnorm_without_weight(self, mock_fd_config):
        """Test RMSNorm replacement when the module has no weight attribute (line 378)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock RMSNorm without a weight attribute
        class MockRMSNormNoWeight(nn.Layer):
            def __init__(self):
                super().__init__()
                # No weight attribute, only epsilon
                self.epsilon = 1e-6

        MockRMSNormNoWeight.__name__ = "MockRMSNorm"  # Name ends with "RMSNorm"

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                self.input_layernorm = MockRMSNormNoWeight()

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,  # Used as the fallback hidden size
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            model.recursive_replace()

            # Should still be wrapped, using hidden_size from text_config
            assert isinstance(model.model.input_layernorm, PaddleFormersRMSNormWrapper)

    def test_linear_without_weight(self, mock_fd_config):
        """Test Linear replacement when the module uses in_features/out_features (lines 321-322)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock Linear that has no weight but does have in/out_features
        class MockLinearNoWeight(nn.Layer):
            def __init__(self, in_features, out_features):
                super().__init__()
                self.in_features = in_features
                self.out_features = out_features
                # weight is None
                self.weight = None
                self.bias = None

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                self.q_proj = MockLinearNoWeight(4096, 4096)

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        # Need to register MockLinearNoWeight as an nn.Linear subclass for the isinstance check
        with (
            patch("paddleformers.transformers.AutoModel"),
            patch("paddleformers.transformers.AutoConfig"),
            patch.object(nn.Linear, "__subclasscheck__", return_value=True),
        ):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            # This exercises the path where weight is None and in_features/out_features are used.
            # However, since the isinstance check happens first and our mock isn't a real nn.Linear,
            # the replacement won't trigger. That is the expected behavior here.
            model.recursive_replace()


class TestGetTPPlan:
    """Test _get_tp_plan to cover lines 410-473."""

    def test_get_tp_plan_with_paddleformers_mappings(self, mock_fd_config):
        """Test _get_tp_plan when the model has _get_tensor_parallel_mappings (lines 410-471)."""
        from functools import partial

        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        # Create a mock function that simulates a PaddleFormers TP mapping
        def mock_split_fn(tensor, is_column=False):
            return tensor

        # Mock mappings as returned by PaddleFormers
        mock_mappings = {
            "model.layers.0.self_attn.q_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.self_attn.k_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.self_attn.v_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.self_attn.o_proj.weight": partial(mock_split_fn, is_column=False),
            "model.layers.0.mlp.gate_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.mlp.up_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.mlp.down_proj.weight": partial(mock_split_fn, is_column=False),
        }

        class MockModelClass:
            @classmethod
            def _get_tensor_parallel_mappings(cls, config, is_split=True):
                return mock_mappings

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()

        mock_model_obj = MockModel()
        # Override the class type
        mock_model_obj.__class__ = MockModelClass

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            tp_plan = model._get_tp_plan()

            # Should contain patterns derived from the mappings
            assert r"\.q_proj$" in tp_plan
            assert r"\.k_proj$" in tp_plan
            assert r"\.v_proj$" in tp_plan
            assert tp_plan[r"\.q_proj$"] == "colwise"

    def test_get_tp_plan_with_fused_qkv(self, mock_fd_config):
        """Test that _get_tp_plan adjusts for fused QKV (lines 444-453)."""
        from functools import partial

        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        def mock_split_fn(tensor, is_column=False):
            return tensor

        mock_mappings = {
            "model.layers.0.self_attn.q_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.self_attn.k_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.self_attn.v_proj.weight": partial(mock_split_fn, is_column=True),
        }

        class MockModelClass:
            @classmethod
            def _get_tensor_parallel_mappings(cls, config, is_split=True):
                return mock_mappings

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()

        mock_model_obj = MockModel()
        mock_model_obj.__class__ = MockModelClass

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = True  # Enable fused QKV
            model._use_fused_ffn = False

            tp_plan = model._get_tp_plan()

            # With fused QKV, the plan should have qkv_proj instead of q/k/v_proj
            assert r"\.qkv_proj$" in tp_plan
            assert tp_plan[r"\.qkv_proj$"] == "colwise"
            # q/k/v_proj should be removed
            assert r"\.q_proj$" not in tp_plan

    def test_get_tp_plan_with_fused_ffn(self, mock_fd_config):
        """Test that _get_tp_plan adjusts for fused FFN (lines 458-460)."""
        from functools import partial

        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        def mock_split_fn(tensor, is_column=False):
            return tensor

        # Mock mappings with gate_proj and up_proj (before fusion)
        mock_mappings = {
            "model.layers.0.mlp.gate_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.mlp.up_proj.weight": partial(mock_split_fn, is_column=True),
            "model.layers.0.mlp.down_proj.weight": partial(mock_split_fn, is_column=False),
        }

        class MockModelClass:
            @classmethod
            def _get_tensor_parallel_mappings(cls, config, is_split=True):
                return mock_mappings

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()

        mock_model_obj = MockModel()
        mock_model_obj.__class__ = MockModelClass

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = True  # Enable fused FFN

            tp_plan = model._get_tp_plan()

            # With fused FFN, the plan should have up_gate_proj instead of gate/up_proj
            assert r"\.up_gate_proj$" in tp_plan
            assert tp_plan[r"\.up_gate_proj$"] == "colwise"
            # gate_proj and up_proj should be removed
            assert r"\.gate_proj$" not in tp_plan
            assert r"\.up_proj$" not in tp_plan

    def test_get_tp_plan_fallback_on_exception(self, mock_fd_config):
        """Test that _get_tp_plan falls back to the default plan on exception (lines 472-473)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class MockModelClass:
            @classmethod
            def _get_tensor_parallel_mappings(cls, config, is_split=True):
                raise RuntimeError("Simulated error")

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()

        mock_model_obj = MockModel()
        mock_model_obj.__class__ = MockModelClass

        class TestModel(PaddleFormersModelBase):
            pass

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            tp_plan = model._get_tp_plan()

            # Should fall back to the default plan
            assert r"\.q_proj$" in tp_plan
            assert r"\.down_proj$" in tp_plan


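# Hedged sketch of the derivation TestGetTPPlan pins down (illustration only):
# a PaddleFormers TP mapping is keyed by full parameter names, and each value is
# a functools.partial carrying an `is_column` keyword; the plan reduces this to
# regex patterns over the trailing module name.
def _reference_plan_from_mappings(mappings):
    plan = {}
    for param_name, split_fn in mappings.items():
        module_name = param_name.rsplit(".", 2)[-2]  # "q_proj" from "...self_attn.q_proj.weight"
        direction = "colwise" if split_fn.keywords.get("is_column") else "rowwise"
        plan[rf"\.{module_name}$"] = direction
    return plan

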
class TestFusionSettings:
|
|
"""Test __init__ fusion settings to cover lines 201-202, 206-207, 214-216."""
|
|
|
|
def test_tp_greater_than_1_keeps_fused_qkv_for_qwen(self, mock_fd_config_tp2):
|
|
"""Test that Qwen keeps fused QKV enabled under TP>1."""
|
|
from fastdeploy.model_executor.models.paddleformers.base import (
|
|
PaddleFormersModelBase,
|
|
)
|
|
|
|
fd_config, tmp_dir = mock_fd_config_tp2
|
|
|
|
# Create a mock paddleformers config
|
|
mock_pf_config = SimpleNamespace(
|
|
model_type="qwen3",
|
|
fuse_rms_norm=False,
|
|
hidden_size=4096,
|
|
num_hidden_layers=2,
|
|
num_attention_heads=32,
|
|
num_key_value_heads=8,
|
|
vocab_size=32000,
|
|
_attn_implementation=None,
|
|
)
|
|
|
|
mock_pf_model = MagicMock()
|
|
mock_pf_model.eval = Mock()
|
|
mock_embedding = MagicMock()
|
|
mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding)
|
|
mock_pf_model.set_input_embeddings = Mock()
|
|
|
|
class TestModel(PaddleFormersModelBase):
|
|
pass
|
|
|
|
# Patch nn.Layer.__init__ to accept fd_config and be a no-op
|
|
def mock_layer_init(self, *args, **kwargs):
|
|
self._sub_layers = {}
|
|
self._parameters = {}
|
|
self._buffers = {}
|
|
self._loaddict_holder = {}
|
|
|
|
with (
|
|
patch.object(nn.Layer, "__init__", mock_layer_init),
|
|
patch("paddleformers.transformers.AutoConfig.from_pretrained", return_value=mock_pf_config),
|
|
patch("paddleformers.transformers.AutoModel.from_config", return_value=mock_pf_model),
|
|
patch.object(TestModel, "recursive_replace"),
|
|
patch.object(TestModel, "create_attention_instances", return_value={}),
|
|
patch("fastdeploy.model_executor.models.paddleformers.base.VocabParallelEmbedding"),
|
|
):
|
|
|
|
model = TestModel(fd_config)
|
|
|
|
# With TP=2 and qwen model type, fused QKV stays enabled.
|
|
assert model._use_fused_qkv is True
|
|
assert mock_pf_config.fuse_attention_qkv is True
|
|
|
|
def test_qwen3_tp1_enables_fused_qkv_and_ffn(self, mock_fd_config_qwen3):
|
|
"""Test that Qwen3 with TP=1 enables fused QKV and FFN (lines 206-207, 214-216)."""
|
|
from fastdeploy.model_executor.models.paddleformers.base import (
|
|
PaddleFormersModelBase,
|
|
)
|
|
|
|
fd_config, tmp_dir = mock_fd_config_qwen3
|
|
|
|
# Create a mock paddleformers config
|
|
mock_pf_config = SimpleNamespace(
|
|
model_type="qwen3",
|
|
fuse_rms_norm=False,
|
|
fuse_attention_qkv=False,
|
|
fuse_attention_ffn=False,
|
|
fuse_swiglu=False,
|
|
hidden_size=4096,
|
|
num_hidden_layers=2,
|
|
num_attention_heads=32,
|
|
num_key_value_heads=8,
|
|
vocab_size=32000,
|
|
_attn_implementation=None,
|
|
)
|
|
|
|
mock_pf_model = MagicMock()
|
|
mock_pf_model.eval = Mock()
|
|
mock_embedding = MagicMock()
|
|
mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding)
|
|
mock_pf_model.set_input_embeddings = Mock()
|
|
|
|
class TestModel(PaddleFormersModelBase):
|
|
pass
|
|
|
|
def mock_layer_init(self, *args, **kwargs):
|
|
self._sub_layers = {}
|
|
self._parameters = {}
|
|
self._buffers = {}
|
|
self._loaddict_holder = {}
|
|
|
|
with (
|
|
patch.object(nn.Layer, "__init__", mock_layer_init),
|
|
patch("paddleformers.transformers.AutoConfig.from_pretrained", return_value=mock_pf_config),
|
|
patch("paddleformers.transformers.AutoModel.from_config", return_value=mock_pf_model),
|
|
patch.object(TestModel, "recursive_replace"),
|
|
patch.object(TestModel, "create_attention_instances", return_value={}),
|
|
patch("fastdeploy.model_executor.models.paddleformers.base.VocabParallelEmbedding"),
|
|
):
|
|
|
|
model = TestModel(fd_config)
|
|
|
|
# With Qwen3 and TP=1, fused QKV and FFN should be enabled
|
|
assert model._use_fused_qkv is True
|
|
assert model._use_fused_ffn is True
|
|
# Config should also be updated
|
|
assert mock_pf_config.fuse_attention_qkv is True
|
|
assert mock_pf_config.fuse_attention_ffn is True
|
|
assert mock_pf_config.fuse_swiglu is True
|
|
|
|
    def test_non_qwen_model_disables_fusion(self, mock_fd_config):
        """Test that non-Qwen model types disable fusion."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, tmp_dir = mock_fd_config

        # Create a mock paddleformers config with a non-qwen model type
        mock_pf_config = SimpleNamespace(
            model_type="llama",  # Not in supported_fused_qkv_models
            fuse_rms_norm=False,
            hidden_size=4096,
            num_hidden_layers=2,
            num_attention_heads=32,
            num_key_value_heads=32,
            vocab_size=32000,
            _attn_implementation=None,
        )

        mock_pf_model = MagicMock()
        mock_pf_model.eval = Mock()
        mock_embedding = MagicMock()
        mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding)
        mock_pf_model.set_input_embeddings = Mock()

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch("paddleformers.transformers.AutoConfig.from_pretrained", return_value=mock_pf_config),
            patch("paddleformers.transformers.AutoModel.from_config", return_value=mock_pf_model),
            patch.object(TestModel, "recursive_replace"),
            patch.object(TestModel, "create_attention_instances", return_value={}),
            patch("fastdeploy.model_executor.models.paddleformers.base.VocabParallelEmbedding"),
        ):
            model = TestModel(fd_config)

        # With the llama model type, fusion should be disabled
        assert model._use_fused_qkv is False
        assert model._use_fused_ffn is False

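
# A minimal, standalone sketch of the fusion decision the three tests above
# exercise, written from the observed behavior rather than from base.py itself
# (the function name `decide_fusion_sketch` and the exact model-type set are
# assumptions): qwen-family models enable fused QKV, FFN fusion is
# additionally observed at TP=1, and other model types keep both off.
def decide_fusion_sketch(model_type, tensor_parallel_size):
    supported_fused_qkv_models = ("qwen2", "qwen3")  # assumed membership set
    if not any(m in model_type for m in supported_fused_qkv_models):
        return False, False  # e.g. "llama": neither QKV nor FFN fusion
    use_fused_ffn = tensor_parallel_size == 1  # hedged: only TP=1 FFN fusion is asserted above
    return True, use_fused_ffn

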
class TestForward:
    """Test forward() edge cases to cover lines 564, 567-569, 574."""

    def test_forward_without_batch_id_per_token(self, mock_fd_config):
        """Test forward() when batch_id_per_token is None (lines 567-569)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        mock_model_output = paddle.randn([1, 10, 4096])

        mock_pf_model = MagicMock()
        mock_pf_model.return_value = (mock_model_output,)
        mock_pf_model.eval = Mock()
        mock_embedding_layer = Mock(return_value=paddle.randn([10, 4096]))
        mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding_layer)

        mock_pf_config = SimpleNamespace(
            model_type="llama",
            hidden_size=4096,
            num_hidden_layers=2,
            num_attention_heads=32,
            num_key_value_heads=32,
            vocab_size=32000,
            fuse_rms_norm=False,
            _attn_implementation=None,
            forward_meta=None,
            attention_instances=None,
        )

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_pf_model
            model.paddleformers_config = mock_pf_config

            # Create forward_meta with batch_id_per_token = None (triggers lines 567-569)
            forward_meta = SimpleNamespace(
                batch_id_per_token=None,
                seq_lens_decoder=paddle.to_tensor([[5]], dtype="int64"),
                cu_seqlens_q=None,
            )

            input_ids = paddle.to_tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype="int64")

            hidden_states = model.forward(input_ids, forward_meta)

            assert hidden_states.shape == [10, 4096]

    def test_forward_with_cu_seqlens_none(self, mock_fd_config):
        """Test forward() when cu_seqlens is None but batch_id_per_token exists (line 564)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        mock_model_output = paddle.randn([1, 10, 4096])

        mock_pf_model = MagicMock()
        mock_pf_model.return_value = (mock_model_output,)
        mock_pf_model.eval = Mock()
        mock_embedding_layer = Mock(return_value=paddle.randn([10, 4096]))
        mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding_layer)

        mock_pf_config = SimpleNamespace(
            model_type="llama",
            hidden_size=4096,
            num_hidden_layers=2,
            num_attention_heads=32,
            num_key_value_heads=32,
            vocab_size=32000,
            fuse_rms_norm=False,
            _attn_implementation=None,
            forward_meta=None,
            attention_instances=None,
        )

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
            )
            model.model = mock_pf_model
            model.paddleformers_config = mock_pf_config

            # Create forward_meta with cu_seqlens_q = None (triggers line 564)
            forward_meta = SimpleNamespace(
                batch_id_per_token=paddle.to_tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype="int64"),
                seq_lens_decoder=paddle.to_tensor([[5]], dtype="int64"),
                cu_seqlens_q=None,  # This triggers line 564
            )

            input_ids = paddle.to_tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype="int64")

            hidden_states = model.forward(input_ids, forward_meta)

            assert hidden_states.shape == [10, 4096]

    def test_forward_with_mrope(self, mock_fd_config):
        """Test forward() with uses_mrope=True (line 574)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        mock_model_output = paddle.randn([1, 10, 4096])

        mock_pf_model = MagicMock()
        mock_pf_model.return_value = (mock_model_output,)
        mock_pf_model.eval = Mock()
        mock_embedding_layer = Mock(return_value=paddle.randn([10, 4096]))
        mock_pf_model.get_input_embeddings = Mock(return_value=mock_embedding_layer)

        mock_pf_config = SimpleNamespace(
            model_type="llama",
            hidden_size=4096,
            num_hidden_layers=2,
            num_attention_heads=32,
            num_key_value_heads=32,
            vocab_size=32000,
            fuse_rms_norm=False,
            _attn_implementation=None,
            forward_meta=None,
            attention_instances=None,
        )

        with patch("paddleformers.transformers.AutoModel"), patch("paddleformers.transformers.AutoConfig"):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(
                hidden_size=4096,
                vocab_size=32000,
                uses_mrope=True,  # This triggers line 574
            )
            model.model = mock_pf_model
            model.paddleformers_config = mock_pf_config

            # Create forward_meta without batch_id_per_token
            forward_meta = SimpleNamespace(
                batch_id_per_token=None,
                seq_lens_decoder=None,
                cu_seqlens_q=None,
            )

            input_ids = paddle.to_tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype="int64")

            hidden_states = model.forward(input_ids, forward_meta)

            assert hidden_states.shape == [10, 4096]

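
# A hedged sketch (not base.py's actual code) of the fallback the forward()
# tests above rely on: when forward_meta.batch_id_per_token or cu_seqlens_q is
# None, a single-sequence layout for `num_tokens` tokens can be synthesized.
# The helper name and dtypes are illustrative assumptions only.
def synthesize_single_sequence_meta(num_tokens):
    batch_id_per_token = paddle.zeros([num_tokens], dtype="int64")  # all tokens in batch 0
    cu_seqlens_q = paddle.to_tensor([0, num_tokens], dtype="int32")  # one cumulative segment
    return batch_id_per_token, cu_seqlens_q

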
class TestLoadWeights:
    """Test load_weights to cover lines 619-800."""

    @pytest.fixture(autouse=True)
    def setup_mocks(self):
        """Set up common mocks for all tests in this class."""
        self.mock_model_output = (paddle.randn([1, 10, 4096]),)

        # Mock PF model
        self.mock_pf_model = MagicMock()
        self.mock_pf_model.return_value = self.mock_model_output
        self.mock_pf_model.eval = Mock()
        self.mock_pf_model.named_parameters = Mock(return_value=[])
        self.mock_pf_model.named_sublayers = Mock(return_value=[])

        # Mock AutoModel.from_config to return our mock model
        self.auto_model_patcher = patch(
            "paddleformers.transformers.AutoModel.from_config", return_value=self.mock_pf_model
        )
        self.mock_auto_model = self.auto_model_patcher.start()

        # Mock AutoConfig
        self.auto_config_patcher = patch("paddleformers.transformers.AutoConfig")
        self.mock_auto_config = self.auto_config_patcher.start()

        # Configure the from_pretrained return value properly
        mock_config_instance = MagicMock()
        mock_config_instance.hidden_size = 4096
        mock_config_instance.num_attention_heads = 32
        mock_config_instance.num_key_value_heads = 32
        mock_config_instance.head_dim = 128
        self.mock_auto_config.from_pretrained.return_value = mock_config_instance

        # Also set return_value in case the config class is instantiated directly
        self.mock_auto_config.return_value = mock_config_instance

        # Mock VocabParallelEmbedding
        self.vocab_embed_patcher = patch("fastdeploy.model_executor.models.paddleformers.base.VocabParallelEmbedding")
        self.mock_vocab_embed = self.vocab_embed_patcher.start()

        # Mock process_weights_after_loading (correct path)
        self.process_weights_patcher = patch("fastdeploy.model_executor.utils.process_weights_after_loading")
        self.mock_process_weights = self.process_weights_patcher.start()

    def teardown_method(self):
        self.auto_model_patcher.stop()
        self.auto_config_patcher.stop()
        self.vocab_embed_patcher.stop()
        self.process_weights_patcher.stop()

    def test_load_fused_qkv_weights(self, mock_fd_config):
        """Test split q/k/v shards are routed to qkv_proj with shard ids."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config
        # Ensure config supports QKV fusion shapes (TP=1, equal heads)
        fd_config.model_config.num_key_value_heads = 32
        fd_config.model_config.num_attention_heads = 32
        fd_config.model_config.hidden_size = 4096
        fd_config.model_config.head_dim = 128

        class TestModel(PaddleFormersModelBase):
            pass

        # Patch nn.Layer.__init__ to avoid real layer-init issues
        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            # Set up the model
            model = TestModel(fd_config)
            model.fd_config = fd_config
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            # Set up the weight-fusion buffers for QKV
            model.qkv_stacked_mapping = {}
            model.qkv_weight_buffer = {}

            # Create mock parameters in the model;
            # we expect 'model.layers.0.self_attn.qkv_proj.weight' to exist
            qkv_param = MagicMock(spec=paddle.Tensor)
            qkv_param.shape = [4096, 12288]  # [in, out] for the FD fused layout
            qkv_param.weight_loader = Mock()

            # The param dict needs to look like what named_parameters returns
            params_dict = {"model.layers.0.self_attn.qkv_proj.weight": qkv_param}

            # Mock named_parameters and named_sublayers
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            # Prepare weights to load
            q_weight = paddle.randn([4096, 4096])
            k_weight = paddle.randn([4096, 4096])
            v_weight = paddle.randn([4096, 4096])

            weights = [
                ("model.layers.0.self_attn.q_proj.weight", q_weight),
                ("model.layers.0.self_attn.k_proj.weight", k_weight),
                # Provide V last to trigger fusion
                ("model.layers.0.self_attn.v_proj.weight", v_weight),
            ]

            # Run load_weights
            model.load_weights(weights)

            # Verification: split shards are forwarded via shard_id.
            assert qkv_param.weight_loader.called
            calls = qkv_param.weight_loader.call_args_list
            assert len(calls) == 3
            assert [c.args[2] for c in calls] == ["q", "k", "v"]
            assert list(calls[0].args[1].shape) == [4096, 4096]
            assert list(calls[1].args[1].shape) == [4096, 4096]
            assert list(calls[2].args[1].shape) == [4096, 4096]

    def test_load_fused_qkv_weights_torch_writeback_shape(self, mock_fd_config):
        """Torch model_format should route split q/k/v shards without in-test fusion."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config
        fd_config.model_config.model_format = "torch"
        fd_config.model_config.num_key_value_heads = 8
        fd_config.model_config.num_attention_heads = 32
        fd_config.model_config.hidden_size = 4096
        fd_config.model_config.head_dim = 128

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model.fd_config = fd_config
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            qkv_param = MagicMock(spec=paddle.Tensor)
            # torch storage layout: [out, in]
            qkv_param.shape = [6144, 4096]
            qkv_param.weight_loader = Mock()

            params_dict = {"model.layers.0.self_attn.qkv_proj.weight": qkv_param}
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            q_weight = paddle.randn([4096, 4096])  # torch source layout [out, in] (square here)
            k_weight = paddle.randn([1024, 4096])  # torch source layout [out, in]
            v_weight = paddle.randn([1024, 4096])  # torch source layout [out, in]
            weights = [
                ("model.layers.0.self_attn.q_proj.weight", q_weight),
                ("model.layers.0.self_attn.k_proj.weight", k_weight),
                ("model.layers.0.self_attn.v_proj.weight", v_weight),
            ]

            model.load_weights(weights)

            assert qkv_param.weight_loader.called
            calls = qkv_param.weight_loader.call_args_list
            assert len(calls) == 3
            assert [c.args[2] for c in calls] == ["q", "k", "v"]
            assert list(calls[0].args[1].shape) == [4096, 4096]
            assert list(calls[1].args[1].shape) == [1024, 4096]
            assert list(calls[2].args[1].shape) == [1024, 4096]

    def test_load_fused_qkv_weights_torch_accepts_mismatched_source_shapes(self, mock_fd_config):
        """Split q/k/v routing remains shape-agnostic at this unit-test layer."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config
        fd_config.model_config.model_format = "torch"
        fd_config.model_config.num_key_value_heads = 8
        fd_config.model_config.num_attention_heads = 32
        fd_config.model_config.hidden_size = 4096
        fd_config.model_config.head_dim = 128

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model.fd_config = fd_config
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            class DummyParam:
                def __init__(self, shape):
                    self.shape = shape
                    self.weight_loader = Mock()

            qkv_param = DummyParam([6144, 4096])

            params_dict = {"model.layers.0.self_attn.qkv_proj.weight": qkv_param}
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            # Deliberately provide paddle-layout K/V under torch strict policy.
            q_weight = paddle.randn([4096, 4096])
            k_weight = paddle.randn([4096, 1024])
            v_weight = paddle.randn([4096, 1024])
            weights = [
                ("model.layers.0.self_attn.q_proj.weight", q_weight),
                ("model.layers.0.self_attn.k_proj.weight", k_weight),
                ("model.layers.0.self_attn.v_proj.weight", v_weight),
            ]

            model.load_weights(weights)
            calls = qkv_param.weight_loader.call_args_list
            assert len(calls) == 3
            assert [c.args[2] for c in calls] == ["q", "k", "v"]
            assert list(calls[0].args[1].shape) == [4096, 4096]
            assert list(calls[1].args[1].shape) == [4096, 1024]
            assert list(calls[2].args[1].shape) == [4096, 1024]

    def test_load_fused_qkv_weights_split_path_ignores_model_format(self, mock_fd_config):
        """Split q/k/v routing should not depend on model_format value."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config
        fd_config.model_config.model_format = "onnx"
        fd_config.model_config.num_key_value_heads = 8
        fd_config.model_config.num_attention_heads = 32
        fd_config.model_config.hidden_size = 4096
        fd_config.model_config.head_dim = 128

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model.fd_config = fd_config
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            class DummyParam:
                def __init__(self, shape):
                    self.shape = shape
                    self.weight_loader = Mock()

            qkv_param = DummyParam([6144, 4096])

            params_dict = {"model.layers.0.self_attn.qkv_proj.weight": qkv_param}
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            # Use canonical paddle-layout inputs; the split path should route
            # the shards regardless of the (unsupported) "onnx" model_format.
            q_weight = paddle.randn([4096, 4096])
            k_weight = paddle.randn([4096, 1024])
            v_weight = paddle.randn([4096, 1024])
            weights = [
                ("model.layers.0.self_attn.q_proj.weight", q_weight),
                ("model.layers.0.self_attn.k_proj.weight", k_weight),
                ("model.layers.0.self_attn.v_proj.weight", v_weight),
            ]

            model.load_weights(weights)
            calls = qkv_param.weight_loader.call_args_list
            assert len(calls) == 3
            assert [c.args[2] for c in calls] == ["q", "k", "v"]

    def test_load_fused_qkv_biases(self, mock_fd_config):
        """QKV bias shards should be routed to qkv_proj.bias with shard ids."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config
        fd_config.model_config.model_format = "paddle"
        fd_config.model_config.num_key_value_heads = 8
        fd_config.model_config.num_attention_heads = 32
        fd_config.model_config.hidden_size = 4096
        fd_config.model_config.head_dim = 128

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model.fd_config = fd_config
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            class DummyParam:
                def __init__(self, shape):
                    self.shape = shape
                    self.weight_loader = Mock()

            qkv_bias_param = DummyParam([6144])

            params_dict = {"model.layers.0.self_attn.qkv_proj.bias": qkv_bias_param}
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            q_bias = paddle.randn([4096])
            k_bias = paddle.randn([1024])
            v_bias = paddle.randn([1024])
            weights = [
                ("model.layers.0.self_attn.q_proj.bias", q_bias),
                ("model.layers.0.self_attn.k_proj.bias", k_bias),
                ("model.layers.0.self_attn.v_proj.bias", v_bias),
            ]

            model.load_weights(weights)
            assert qkv_bias_param.weight_loader.called
            calls = qkv_bias_param.weight_loader.call_args_list
            assert len(calls) == 3
            assert [c.args[2] for c in calls] == ["q", "k", "v"]
            assert list(calls[0].args[1].shape) == [4096]
            assert list(calls[1].args[1].shape) == [1024]
            assert list(calls[2].args[1].shape) == [1024]

    def test_load_fused_ffn_weights(self, mock_fd_config):
        """Test loading and fusing FFN weights (lines 619-624 + stacked mapping logic)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model._use_fused_qkv = False
            model._use_fused_ffn = True
            model.qkv_stacked_mapping = {}
            model.qkv_weight_buffer = {}
            # stacked_params_mapping is hardcoded in base.py's load_weights, so we rely on it.
            # It maps gate_proj/up_proj (loaded names) to up_gate_proj (the model param).

            up_gate_param = MagicMock(spec=paddle.Tensor)
            up_gate_param.weight_loader = Mock()

            params_dict = {
                "model.layers.0.mlp.up_gate_proj.weight": up_gate_param,
            }
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            # Simulate loading separate gate and up weights from the checkpoint
            loaded_gate = paddle.randn([4096, 11008])  # Example shapes
            loaded_up = paddle.randn([4096, 11008])

            weights = [
                ("model.layers.0.mlp.gate_proj.weight", loaded_gate),
                ("model.layers.0.mlp.up_proj.weight", loaded_up),
            ]

            model.load_weights(weights)

            # The stacked mapping redirects both `gate_proj` and `up_proj` to
            # `up_gate_proj` and invokes that param's weight_loader once per
            # shard, so the loader is called exactly twice; the fused write is
            # performed by the loader itself, not by this test.
            assert up_gate_param.weight_loader.call_count == 2

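    # The comments in test_load_fused_ffn_weights above describe the stacked
    # mapping; below is a tiny hedged sketch of that redirect (the helper name
    # is illustrative, not base.py's actual identifier).
    @staticmethod
    def _sketch_stacked_ffn_redirect(loaded_name):
        # Rewrite a gate_proj/up_proj checkpoint name to its fused target.
        for src in ("gate_proj", "up_proj"):
            if f".{src}." in loaded_name:
                return loaded_name.replace(src, "up_gate_proj")
        return loaded_name
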
    def test_tie_word_embeddings(self, mock_fd_config):
        """Test tie_word_embeddings logic (lines 794-800)."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model.tie_word_embeddings = True
            model.lm_head = MagicMock()
            model.lm_head.linear.weight.set_value = Mock()
            model.qkv_stacked_mapping = {}
            model.qkv_weight_buffer = {}

            # Mock embeddings
            mock_emb_layer = MagicMock()
            mock_emb_layer.embeddings.weight = paddle.randn([32000, 4096])
            model.model = MagicMock()
            model.model.get_input_embeddings.return_value = mock_emb_layer

            # Call load_weights with empty weights
            model.named_parameters = Mock(return_value=[])
            model.named_sublayers = Mock(return_value=[])

            model.load_weights([])

            # Verify set_value called on lm_head
            assert model.lm_head.linear.weight.set_value.called

    def test_load_weights_qkv_direct_is_skipped_when_split_exists(self, mock_fd_config):
        """When split q/k/v exists, direct qkv_proj.* should be skipped for that layer."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = TestModel(fd_config)
            model._use_fused_qkv = True
            model._use_fused_ffn = False

            qkv_param = MagicMock(spec=paddle.Tensor)
            qkv_param.weight_loader = Mock()
            params_dict = {"model.layers.0.self_attn.qkv_proj.weight": qkv_param}
            model.named_parameters = Mock(return_value=params_dict.items())
            model.named_sublayers = Mock(return_value={}.items())

            weights = [
                ("model.layers.0.self_attn.q_proj.weight", paddle.randn([4096, 4096])),
                ("model.layers.0.self_attn.k_proj.weight", paddle.randn([4096, 4096])),
                ("model.layers.0.self_attn.v_proj.weight", paddle.randn([4096, 4096])),
                ("model.layers.0.self_attn.qkv_proj.weight", paddle.randn([4096, 12288])),
            ]
            model.load_weights(weights)

            # Only split q/k/v shards should be loaded for this layer.
            assert qkv_param.weight_loader.call_count == 3
            assert [c.args[2] for c in qkv_param.weight_loader.call_args_list] == ["q", "k", "v"]

    def test_load_weights_direct_qkv_not_found_and_tie_warning(self, mock_fd_config):
        """Cover direct qkv not-found warning and tie_word_embeddings warning path."""
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class TestModel(PaddleFormersModelBase):
            pass

        def mock_layer_init(self, *args, **kwargs):
            self._sub_layers = {}
            self._parameters = {}
            self._buffers = {}
            self._loaddict_holder = {}

        with (
            patch.object(nn.Layer, "__init__", mock_layer_init),
            patch.object(TestModel, "create_attention_instances", return_value={}),
            patch("fastdeploy.model_executor.models.paddleformers.base.logger.warning") as mock_warning,
        ):
            model = TestModel(fd_config)
            model._use_fused_qkv = True
            model._use_fused_ffn = False
            model.tie_word_embeddings = True
            model.lm_head = MagicMock()
            model.lm_head.linear.weight.set_value = Mock()

            model.model = MagicMock()
            # Missing embeddings.weight to hit warning branch.
            model.model.get_input_embeddings.return_value = SimpleNamespace()

            model.named_parameters = Mock(return_value=iter([]))
            model.named_sublayers = Mock(return_value=iter([]))

            weights = [
                ("model.layers.0.self_attn.qkv_proj.weight", paddle.randn([4096, 12288])),
            ]

            model.load_weights(weights)

            warning_texts = [str(c.args[0]) for c in mock_warning.call_args_list if c.args]
            assert any("Direct fused qkv param not found" in msg for msg in warning_texts)
            assert any("tie_word_embeddings=True" in msg for msg in warning_texts)
            assert not model.lm_head.linear.weight.set_value.called

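
# A standalone sketch of the shard routing checked throughout TestLoadWeights
# above (the helper name `route_qkv_shard` and the suffix tuples are
# illustrative assumptions, not base.py's identifiers): a checkpoint name such
# as "...self_attn.q_proj.weight" is rewritten to the fused
# "...qkv_proj.weight" and tagged with a shard id, which
# weight_loader(param, tensor, shard_id) then consumes.
def route_qkv_shard(loaded_name):
    for src, shard_id in (("q_proj", "q"), ("k_proj", "k"), ("v_proj", "v")):
        if f".{src}." in loaded_name:
            return loaded_name.replace(src, "qkv_proj"), shard_id
    return loaded_name, None  # not a split QKV shard

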
class TestLinearNoWeight:
    """Test Linear layer replacement when weight is None (lines 321-322)."""

    def test_linear_no_weight_attrs(self, mock_fd_config):
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersModelBase,
        )

        fd_config, _ = mock_fd_config

        class MockLinear(nn.Linear):
            def __init__(self):
                # Init with dummy args
                super().__init__(10, 10)
                # Force weight to None to trigger the no-weight branch
                self.weight = None
                self.bias = None
                self.in_features = 4096
                self.out_features = 4096

        class MockModel(nn.Layer):
            def __init__(self):
                super().__init__()
                self.q_proj = MockLinear()  # Targets colwise

        mock_model_obj = MockModel()

        class TestModel(PaddleFormersModelBase):
            pass

        with (
            patch("paddleformers.transformers.AutoModel"),
            patch("paddleformers.transformers.AutoConfig"),
            patch.object(TestModel, "create_attention_instances", return_value={}),
        ):
            model = object.__new__(TestModel)
            model.__dict__["_sub_layers"] = {}
            model.__dict__["_parameters"] = {}
            model.__dict__["_buffers"] = {}
            model.__dict__["_loaddict_holder"] = {}
            model.fd_config = fd_config
            model.model_config = fd_config.model_config
            model.text_config = SimpleNamespace(hidden_size=4096)
            model.model = mock_model_obj
            model._use_fused_qkv = False
            model._use_fused_ffn = False

            model.recursive_replace()

            # q_proj should be replaced
            from fastdeploy.model_executor.layers.linear import ColumnParallelLinear

            assert isinstance(model.model.q_proj, ColumnParallelLinear)

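
# A minimal sketch of the fallback TestLinearNoWeight exercises: when an
# nn.Linear arrives with weight=None, the replacement layer cannot read shapes
# from the weight tensor and must fall back to the in_features / out_features
# attributes. The helper name is an assumption for illustration only.
def infer_linear_shape(layer):
    if getattr(layer, "weight", None) is not None:
        return tuple(layer.weight.shape)  # paddle.nn.Linear stores [in, out]
    return layer.in_features, layer.out_features

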
class TestPaddleFormersQKVParallelLinearUnit:
    """Unit tests for PaddleFormersQKVParallelLinear helper methods."""

    @staticmethod
    def _build_layer(model_format: str = "paddle"):
        from fastdeploy.model_executor.models.paddleformers.base import (
            PaddleFormersQKVParallelLinear,
        )

        layer = object.__new__(PaddleFormersQKVParallelLinear)
        layer._pending_local_shards = {}
        layer._model_format = model_format
        layer.tp_size = 1
        layer.local_rank = 0
        layer.num_heads = 4
        layer.kv_num_heads = 2
        layer.num_heads_per_rank = 4
        layer.kv_num_heads_per_rank = 2
        layer.num_kv_head_replicas = 1
        layer.head_dim = 2
        layer.fd_config = SimpleNamespace(load_config=SimpleNamespace(is_pre_sharded=False))
        return layer

    def test_extract_local_shard_with_transpose_and_tp_slice(self):
        layer = self._build_layer()
        layer.tp_size = 2
        layer.local_rank = 1
        layer.num_heads_per_rank = 2
        layer.kv_num_heads_per_rank = 1
        layer.head_dim = 2

        param = SimpleNamespace(output_dim=True, shape=[4, 8], weight_need_transpose=True)
        loaded = paddle.arange(32, dtype="float32").reshape([8, 4])  # [out, in], transpose -> [in, out]

        q_local = layer._extract_local_shard(param, loaded, "q")
        assert list(q_local.shape) == [4, 4]

        expected = loaded.transpose([1, 0])[:, 4:8]
        assert bool(paddle.allclose(q_local, expected))

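    # A hedged, standalone sketch of the TP slice the test above checks (the
    # helper and argument names are illustrative, not the layer's real
    # internals): after normalizing the weight to hidden-major [in, out],
    # rank r keeps the column block [r * out_per_rank, (r + 1) * out_per_rank).
    @staticmethod
    def _sketch_tp_column_slice(weight_hidden_major, local_rank, out_per_rank):
        start = local_rank * out_per_rank
        return weight_hidden_major[:, start : start + out_per_rank]
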
    def test_to_hidden_major_and_pack_paths(self):
        layer = self._build_layer()
        # q_out=8, kv_out=4 for the current head setup.
        q = paddle.randn([8, 3], dtype="float32")  # [out, hidden] -> should transpose
        k = paddle.randn([4, 3], dtype="float32")
        v = paddle.randn([4, 3], dtype="float32")

        packed_out_major = layer._pack_pf_interleaved_local(q, k, v, output_dim=False)
        assert list(packed_out_major.shape) == [16, 3]

        with pytest.raises(ValueError, match="Expected 2D"):
            layer._to_hidden_major(paddle.randn([2], dtype="float32"), 2, "q")
        with pytest.raises(ValueError, match="Cannot normalize"):
            layer._to_hidden_major(paddle.randn([3, 5], dtype="float32"), 4, "q")

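    # A shape-level sketch of the packing asserted above (the real
    # `_pack_pf_interleaved_local` may interleave per head rather than
    # concatenate blockwise; this simplified stand-in only reproduces the
    # asserted [q_out + k_out + v_out, hidden] output shape).
    @staticmethod
    def _sketch_pack_out_major(q, k, v):
        return paddle.concat([q, k, v], axis=0)
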
    def test_split_pf_fused_qkv_and_weight_loader_pending_finalize(self):
        layer = self._build_layer(model_format="paddle")

        class DummyParam:
            def __init__(self, shape, output_dim=True):
                self.shape = shape
                self.output_dim = output_dim
                self.weight_need_transpose = False
                self.dtype = paddle.float32
                self._initialized = False
                self.saved = None

            def _is_initialized(self):
                return self._initialized

            def initialize(self):
                self._initialized = True

            def set_value(self, value):
                self.saved = value

        # Split fused weight path
        fused_weight = paddle.randn([3, 16], dtype="float32")
        q, k, v = layer._split_pf_fused_qkv(fused_weight, is_bias=False)
        assert list(q.shape) == [3, 8]
        assert list(k.shape) == [3, 4]
        assert list(v.shape) == [3, 4]

        fused_bias = paddle.randn([16], dtype="float32")
        qb, kb, vb = layer._split_pf_fused_qkv(fused_bias, is_bias=True)
        assert list(qb.shape) == [8]
        assert list(kb.shape) == [4]
        assert list(vb.shape) == [4]

        # Pending -> finalize path
        param = DummyParam(shape=[3, 16], output_dim=True)
        layer.weight_loader(param, q, "q")
        assert bool(getattr(param, "_pf_qkv_pending", False))
        layer.weight_loader(param, k, "k")
        assert bool(getattr(param, "_pf_qkv_pending", False))
        layer.weight_loader(param, v, "v")
        assert not bool(getattr(param, "_pf_qkv_pending", False))
        assert param.saved is not None
        assert list(param.saved.shape) == [3, 16]

        # A direct fused qkv load in a non-paddle format should be rejected.
        layer_torch = self._build_layer(model_format="torch")
        with pytest.raises(ValueError, match="only supported for model_format='paddle'"):
            layer_torch.weight_loader(param, fused_weight, None)

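
# A hedged, self-contained sketch of the pending/finalize protocol the test
# above exercises (class and attribute names here are illustrative, not the
# layer's real internals): q and k shards are buffered as they arrive, and the
# final v shard triggers a single fused write-back via set_value.
class _PendingQKVSketch:
    def __init__(self):
        self._shards = {}

    def load(self, param, tensor, shard_id):
        self._shards[shard_id] = tensor
        # Mirror the `_pf_qkv_pending` flag asserted in the test above.
        param._pf_qkv_pending = len(self._shards) < 3
        if not param._pf_qkv_pending:
            # All three shards present: fuse along the output axis and write back.
            fused = paddle.concat([self._shards[s] for s in ("q", "k", "v")], axis=-1)
            param.set_value(fused)
            self._shards.clear()

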
if __name__ == "__main__":
    pytest.main([__file__, "-v"])