diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index ee2d765d45..1449f69732 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -466,8 +466,9 @@ class XPUMoEMethod(MoEMethodBase): topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) # 2. Dynamic compute blockwise quantization scales - if "a_tokenwise_int8" in self.xpu_moe_quant_type and x.shape[0] > 0: + if "a_tokenwise_int8" in self.xpu_moe_quant_type: x, x_scale = quant2d_per_token(x) + x_scale = x_scale.unsqueeze(1) else: x_scale = None diff --git a/tests/xpu_ci/4cards_cases/test_ep4tp1_online.py b/tests/xpu_ci/4cards_cases/test_ep4tp1_online.py index 489565e4f1..ba9c1df276 100644 --- a/tests/xpu_ci/4cards_cases/test_ep4tp1_online.py +++ b/tests/xpu_ci/4cards_cases/test_ep4tp1_online.py @@ -24,6 +24,7 @@ EP4TP1在线服务测试 - Expert Parallel + Tensor Parallel """ +import os import subprocess import time @@ -96,6 +97,10 @@ def test_ep4tp1_online(xpu_env): # 设置EP环境变量 original_env = setup_ep_env() + # 设置MOE量化环境变量 + os.environ["FD_XPU_MOE_FFN_QUANT_TYPE_MAP"] = "w_channelwise_int4_a_tokenwise_int8:8->53" + print(f"设置环境变量: FD_XPU_MOE_FFN_QUANT_TYPE_MAP={os.environ['FD_XPU_MOE_FFN_QUANT_TYPE_MAP']}") + stop_processes() cleanup_resources() @@ -135,7 +140,7 @@ def test_ep4tp1_online(xpu_env): f"{','.join([str(i) for i in metrics_ports])}", "--args", "--model", - f"{model_path}/ERNIE-4.5-21B-A3B-Paddle", + f"{model_path}/ERNIE-4.5-300B-A47B-Paddle", "--engine-worker-queue-port", f"{','.join([str(i) for i in engine_worker_queue_ports])}", "--max-model-len", @@ -196,6 +201,9 @@ def test_ep4tp1_online(xpu_env): finally: # 恢复环境变量 + if "FD_XPU_MOE_FFN_QUANT_TYPE_MAP" in os.environ: + del os.environ["FD_XPU_MOE_FFN_QUANT_TYPE_MAP"] + print("删除环境变量: FD_XPU_MOE_FFN_QUANT_TYPE_MAP") restore_env(original_env)