[Optimization] Enable BF16 gate computation for GLM and Qwen (#6457)

* gate bf16

* add gate-fp32

* fix

* update baseline

* update

* update

* fix
This commit is contained in:
sunxin
2026-02-27 13:08:46 +08:00
committed by GitHub
parent edd31e8849
commit 53aaac69da
19 changed files with 95 additions and 28 deletions
+11
View File
@@ -501,6 +501,11 @@ class EngineArgs:
Flag to specify the dtype of lm_head as FP32. Default is False (Using model default dtype).
"""
moe_gate_fp32: bool = False
"""
Flag to specify the dtype of gate as FP32. Default is False (Using model default dtype).
"""
logits_processors: Optional[List[str]] = None
"""
A list of FQCNs (Fully Qualified Class Names) of logits processors supported by the service.
@@ -909,6 +914,12 @@ class EngineArgs:
default=EngineArgs.lm_head_fp32,
help="Specify the dtype of lm_head weight as float32.",
)
# CLI mirror of EngineArgs.moe_gate_fp32: a store_true flag, so passing
# --moe-gate-fp32 opts the MoE gate back into float32 computation. The
# default is taken from the dataclass field so the two stay in sync.
model_group.add_argument(
"--moe-gate-fp32",
action="store_true",
default=EngineArgs.moe_gate_fp32,
help="Specify the dtype of gate weight as float32.",
)
model_group.add_argument(
"--logits-processors",
type=str,