[BugFix] Set default OMP_NUM_THREADS=3 and fix extra GPU memory usage in DeepSeek (#5219)

* fix bug

* update

* update

* update

* fix copy

* update
This commit is contained in:
bukejiyu
2025-11-28 14:22:04 +08:00
committed by GitHub
parent 7dc06cac6e
commit 1539fd6056
6 changed files with 29 additions and 16 deletions
+15 -7
View File
@@ -356,10 +356,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
self.output_sizes = output_sizes
def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
weight_need_transpose = getattr(param, "weight_need_transpose", False)
if weight_need_transpose:
loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
assert loaded_shard_id in ["q_a", "kv_a"]
if not param._is_initialized():
param.initialize()
@@ -385,7 +381,6 @@ class MergedReplicatedLinear(ReplicatedLinear):
else:
loaded_weight = loaded_weight.cast(param.dtype)
# (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
loaded_weight = get_tensor(loaded_weight)
h2d_copy(param, loaded_weight)
@@ -452,7 +447,17 @@ class ColumnParallelLinear(LinearBase):
if self.with_bias:
# col parallel
_set_var_distributed(self.bias, split_axis=1)
set_weight_attrs(self.bias, {"output_dim": True})
set_weight_attrs(
self.bias,
{
"output_dim": True,
"weight_loader": (
self.weight_loader
if hasattr(self, "weight_loader")
else default_weight_loader(self.fd_config)
),
},
)
class MergedColumnParallelLinear(ColumnParallelLinear):
@@ -955,7 +960,10 @@ class KVBatchLinear(nn.Layer):
self.num_heads_per_partition = divide(num_attention_heads, self.tp_size)
self.local_rank = fd_config.parallel_config.tensor_parallel_rank
self.fd_config = fd_config
self.kv_b_proj = kv_b_proj
if self.fd_config.load_config.load_choices == "default_v1":
self.kv_b_proj = kv_b_proj
else:
self.kv_b_proj = None
self.weight_dtype = self._helper.get_default_dtype()