[Features] support hugging face qwen3 moe (#3649)

* split ut
* qwen3-30B-A3B
* fix
* add test
* add test_torch_model.py
* fix test_torch_model.py
* delete print
* fix moe
* delete init.py
* fix
* fix

---------

Co-authored-by: bukejiyu <395822456@qq.com>
Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2025-08-30 15:26:05 +08:00
parent f206474cc7
commit 455205f991
9 changed files with 437 additions and 258 deletions
@@ -294,6 +294,7 @@ class ReplicatedLinear(LinearBase):
            weight_loader=(
                self.weight_loader if hasattr(self, "weight_loader") else default_weight_loader(self.fd_config)
            ),
+            model_format=fd_config.model_config.model_format,
        )


@@ -446,7 +447,6 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                shard_size = (self.local_rank + 1) * block_size
                loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)

-            loaded_weight = get_tensor(loaded_weight)
            if not param._is_initialized():
                param.initialize()
            param_shard_size = output_size // 2
@@ -574,7 +574,6 @@ class QKVParallelLinear(ColumnParallelLinear):
                shard_size = (shard_id + 1) * block_size
                loaded_weight = slice_fn(loaded_weight, output_dim, start=shard_offset, end=shard_size)

-            loaded_weight = get_tensor(loaded_weight)
            if not param._is_initialized():
                param.initialize()