[Feature] use phi permute/unpermute & rm swiglu (#6361)

* TP text output correct

* eb5 mini text output correct on B cards

* eb5mini EP on B cards: text output correct

* default use phi moe op

* stash

* TP on H cards works correctly

* ep ok

* rm debug

* rm debug tool

* rm del ffn_out

* rm swiglu

* add envs to swiglu

* merge dev

* fix ci baseline

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix ci baseline 2

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Author: fxyfxy777
Date: 2026-03-12 17:01:57 +08:00
Committed by: GitHub
Parent: a3d7979711
Commit: 250ce40b40
18 changed files with 187 additions and 112 deletions
+1 -1
@@ -185,7 +185,7 @@ jobs:
-d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
set +e
rm -rf ./baseline_output
cp -r baseline_dev/ERNIE-4.5-0.3B-Paddle ./baseline_output
cp -r baseline_dev_0311/ERNIE-4.5-0.3B-Paddle ./baseline_output
LOGPROB_EXIT_CODE=0
python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env
+4
@@ -200,6 +200,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))),
# Whether to use phi FP8 quantization,if 1,use paddle default.
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
# Whether to use the phi MoE permute; if 1, use the paddle default.
"FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "1"))),
# Controls whether the SiluAndMul class uses the swiglu or the fused_bias_act operator in its forward_cuda function
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Reserve output blocks for decoding requests when schedule new prefill requests
"FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int(
os.getenv("FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "16")
@@ -20,6 +20,7 @@ import paddle
from paddle import nn
from paddle.incubate.nn.functional import fused_bias_act, swiglu
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.platforms import current_platform
@@ -120,7 +121,7 @@ class SiluAndMul(nn.Layer):
Returns:
Tensor: Output tensor.
"""
if self.bias is None and self.quant_scale == -1:
if self.bias is None and self.quant_scale == -1 and envs.FD_SiluAndMul_USE_PHI_SWIGLU:
return paddle.nn.functional.swiglu(x)
return fused_bias_act(
x,
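With the flag on, the swiglu fast path is only taken when the layer has no bias and no quant scale; every other case still goes through fused_bias_act. For reference, a sketch of the SwiGLU that path computes (assuming the usual gate/up packing along the last axis; illustration only):

    import paddle
    import paddle.nn.functional as F

    def swiglu_reference(x: paddle.Tensor) -> paddle.Tensor:
        # x packs [gate | up] halves along the last axis
        gate, up = paddle.chunk(x, 2, axis=-1)
        return F.silu(gate) * up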
@@ -151,7 +151,8 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
(permute_input.shape[0], layer_added_weight_attrs_0.shape[1]),
dtype=paddle.bfloat16,
)
if disable_ue8m0_cast:
# if disable_ue8m0_cast:
if permute_scale.strides[0] != 1:
permute_scale = permute_scale.transpose([1, 0]).contiguous()
permute_scale = permute_scale.transpose([1, 0])
# disable_ue8m0_cast is False for SM100
@@ -487,31 +488,52 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
elif token_all_num > 0:
logger.debug(f"token_all_num {token_all_num}")
token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
recv_topk_idx = recv_topk_idx.astype(paddle.int32)
(
permute_input,
permute_indices_per_token, # == zipped_expertwise_rowmap
dst_weights,
permute_scale,
m_indices,
) = paddle.nn.functional.moe_permute(
hidden_states=recv_x,
scale=recv_x_scale,
expert_routemap_topk=recv_topk_idx,
expert_prob_topk=recv_topk_weights,
num_experts=layer.num_local_experts,
tokens_per_expert=[],
padding_alignment=128,
return_expert_indices=True,
override_buffer_size=token_all_num,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
else:
token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
recv_topk_idx,
recv_topk_weights,
token_nums_this_rank[0],
token_nums_this_rank[1],
True, # use_in_ep
token_all_num,
)
(
permute_input,
permute_scale,
permute_indices_per_token,
_,
_,
dst_weights,
dst_indices,
_,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x_value,
recv_x_scale,
recv_topk_idx,
recv_topk_weights,
token_nums_this_rank[0],
token_nums_this_rank[1],
True, # use_in_ep
token_all_num,
)
assert permute_input.shape[0] == token_all_num
if not self.quant_config.deepgemm_scale_ue8m0:
if permute_scale.strides[0] != 1:
permute_scale = permute_scale.transpose([1, 0]).contiguous().transpose([1, 0])
# up_gate_proj
@@ -553,20 +575,30 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
ffn_out,
m_indices,
)
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
tmp_ffn_out, out_probs = paddle.nn.functional.moe_unpermute(
hidden_states_unzipped=ffn_out,
zipped_expertwise_rowmap=permute_indices_per_token,
expert_routemap_topk=recv_topk_idx,
token_prob_unzipped=dst_weights,
total_zipped_tokens=recv_x.shape[0],
num_experts=layer.num_local_experts,
using_weighted_combine=True,
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # down_proj_bias
False, # norm_topk_prob
1.0,
)
else:
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None, # down_proj_bias
False, # norm_topk_prob
1.0,
)
else:
tmp_ffn_out = paddle.empty([0, hidden_size], paddle.bfloat16)
# 5. EP combine
event = deep_ep.Buffer.capture()
if self.ep_prefill_runner.num_worst_tokens <= 0:
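Conceptually, moe_permute/moe_unpermute replace the custom dispatch/combine ops with the same round trip: replicate each token once per routed expert, group the rows by expert (padded so each expert block aligns to 128 rows for the grouped GEMM), run the expert FFNs, then scatter the rows back and take the probability-weighted sum. A toy host-side sketch of that round trip (names and shapes are assumptions; padding is omitted):

    import numpy as np

    def toy_moe_round_trip(x, topk_idx, topk_prob, expert_fn):
        rows, experts, owners = [], [], []
        for t in range(x.shape[0]):                       # "permute": one row per (token, expert) pair
            for e, p in zip(topk_idx[t], topk_prob[t]):
                rows.append(x[t]); experts.append(e); owners.append((t, p))
        order = np.argsort(experts, kind="stable")        # group rows by expert (m_indices analogue)
        permuted = np.stack(rows)[order]
        ffn_out = expert_fn(permuted, np.asarray(experts)[order])   # stands in for the grouped FP8 GEMMs
        out = np.zeros_like(x)                            # "unpermute": weighted scatter-add back to tokens
        for row, (t, p) in zip(ffn_out, [owners[i] for i in order]):
            out[t] += p * row
        return out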
@@ -697,14 +729,11 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
if topk_ids_hookfunc is not None:
topk_ids_hookfunc(topk_ids=topk_ids)
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
x, 128, self.quant_config.deepgemm_scale_ue8m0
)
else:
recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
@@ -717,26 +746,49 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
else recv_x_scale.T[: recv_x.shape[0]]
)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
topk_ids,
topk_weights,
tmp[0],
tmp[1],
False, # use_in_ep
-1,
)
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
topk_ids = topk_ids.astype(paddle.int32)
override_buffer_size = recv_x.shape[0] * layer.top_k + layer.num_experts * (128 - 1)
(
permute_input,
permute_indices_per_token, # == zipped_expertwise_rowmap
dst_weights,
permute_scale,
m_indices,
) = paddle.nn.functional.moe_permute(
hidden_states=recv_x,
scale=recv_x_scale,
expert_routemap_topk=topk_ids,
expert_prob_topk=topk_weights,
num_experts=layer.num_experts,
tokens_per_expert=[],
padding_alignment=128,
return_expert_indices=True,
override_buffer_size=override_buffer_size,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
else:
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
(
permute_input,
permute_scale,
permute_indices_per_token,
recv_num_tokens_per_expert_list_cumsum,
recv_num_tokens_per_expert_list_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices,
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
recv_x,
recv_x_scale,
topk_ids,
topk_weights,
tmp[0],
tmp[1],
False, # use_in_ep
-1,
)
ffn_out = m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
permute_input,
@@ -751,14 +803,24 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
)
# prmt back per rank
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None,
False, # norm_topk_prob
1.0,
)
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
tmp_ffn_out, out_probs = paddle.nn.functional.moe_unpermute(
hidden_states_unzipped=ffn_out,
zipped_expertwise_rowmap=permute_indices_per_token,
expert_routemap_topk=topk_ids,
token_prob_unzipped=dst_weights,
total_zipped_tokens=recv_x.shape[0],
num_experts=layer.num_experts,
using_weighted_combine=True,
)
else:
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
ffn_out,
dst_weights,
permute_indices_per_token,
dst_indices,
None,
False, # norm_topk_prob
1.0,
)
return tmp_ffn_out
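The override_buffer_size bound used on this path reserves one row per (token, expert) pair plus up to alignment-1 padding rows per expert, so each expert's block can be rounded up to a multiple of the 128-row alignment. A worked example with hypothetical sizes:

    tokens, top_k, num_experts, align = 256, 8, 64, 128
    override_buffer_size = tokens * top_k + num_experts * (align - 1)
    # 256 * 8 + 64 * 127 = 2048 + 8128 = 10176 permuted rows at most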
+14 -3
@@ -25,6 +25,7 @@ from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import (
support_graph_optimization,
@@ -161,6 +162,7 @@ class Glm4Moe(nn.Layer):
self.experts = FusedMoE(
fd_config,
reduce_results=False,
renormalize=self.norm_topk_prob,
moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
num_experts=fd_config.model_config.n_routed_experts,
@@ -181,14 +183,21 @@ class Glm4Moe(nn.Layer):
intermediate_size=shared_experts_intermediate_size,
layer_id=layer_id,
prefix=f"{prefix}.shared_experts",
reduce_results=False,
)
def forward(self, x, forward_meta: ForwardMeta = None):
# Both experts and shared_experts return partial sums (no all-reduce).
# Combine them first, then do a single all-reduce — eliminating one
# collective communication compared to the naive sequential approach.
# NOTE: only valid for pure-TP mode (use_ep=False). In EP or EP+TP modes
# FusedMoE uses all-to-all internally and already produces a full result,
# so the extra all-reduce must be skipped to avoid double-reduction.
out = self.experts(x, self.gate, forward_meta)
if self.n_shared_experts > 0:
shared_experts_out = self.shared_experts(x)
out = out + shared_experts_out
out = out + self.shared_experts(x)
if self.use_tp and not self.use_ep:
out = tensor_model_parallel_all_reduce(out, self.tp_group)
return out
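A sketch of the pattern the comment above describes (all_reduce_sum stands in for tensor_model_parallel_all_reduce; illustration only, not the FusedMoE code). Because the reduction is a sum, folding the shared-expert output in before the collective gives the same result with one all-reduce instead of two:

    def naive(x, experts, shared, all_reduce_sum):
        return all_reduce_sum(experts(x)) + all_reduce_sum(shared(x))   # two collectives

    def fused(x, experts, shared, all_reduce_sum):
        return all_reduce_sum(experts(x) + shared(x))                   # one collective, same sum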
@@ -535,7 +544,9 @@ class Glm4MoeForCausalLM(ModelForCasualLM):
forward_meta: ForwardMeta,
):
""" """
paddle.cuda.nvtx.range_push("GLM4_MOE_BF")
hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
paddle.cuda.nvtx.range_pop()
return hidden_states
+6 -6
@@ -25,10 +25,10 @@ def test_unstream_with_logprobs():
# Verify the returned content and logprob info
assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
"token": "牛顿",
"logprob": -0.03113006055355072,
"logprob": -0.031025361269712448,
"bytes": [231, 137, 155, 233, 161, 191],
"top_logprobs": None,
}
@@ -102,10 +102,10 @@ def test_stream_with_logprobs():
# Verify the logprob fields
assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
"token": "牛顿",
"logprob": -0.03113006055355072,
"logprob": -0.031025361269712448,
"bytes": [231, 137, 155, 233, 161, 191],
}
@@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs():
# Verify the logprob fields
assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953
assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
"token": "牛顿",
"logprob": -0.0068125599063932896,
"logprob": -0.006811376195400953,
"bytes": [231, 137, 155, 233, 161, 191],
}
@@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
# base result
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115")
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0311")
else:
base_file = "ernie-4_5-vl-base-tp2-dev-0113"
base_file = "ernie-4_5-vl-base-tp2-dev-0311"
with open(base_file, "r") as f:
content2 = f.read()
@@ -185,7 +185,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
# Verify the returned content and logprob info
assert (
resp_json["choices"][0]["message"]["content"]
== "\n<think>这个问题是关于牛顿的三大运动定律。牛顿的三大运动定律是经典"
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."
@@ -182,7 +182,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
# Verify the returned content and logprob info
assert (
resp_json["choices"][0]["message"]["content"]
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
== "\n<think>这个问题是关于牛顿的三大运动定律。牛顿的三大运动定律是经典"
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."
+6 -6
@@ -328,9 +328,9 @@ def test_text_diff(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_text_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_text_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_text_baseline.txt"
base_file = "21b_ep4_text_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = f.read()
@@ -596,9 +596,9 @@ def test_non_stream_with_logprobs(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_logprobs_non_stream_static_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_logprobs_non_stream_static_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_logprobs_non_stream_static_baseline.txt"
base_file = "21b_ep4_logprobs_non_stream_static_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = json.load(f)
@@ -629,9 +629,9 @@ def test_stream_with_logprobs(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_logprobs_stream_static_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_logprobs_stream_static_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_logprobs_stream_static_baseline.txt"
base_file = "21b_ep4_logprobs_stream_static_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = json.load(f)
@@ -337,9 +337,9 @@ def test_text_diff(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_mtp_text_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_mtp_text_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_mtp_text_baseline.txt"
base_file = "21b_ep4_mtp_text_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = f.read()
@@ -504,9 +504,9 @@ def test_non_stream_with_logprobs(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_non_stream_static_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_non_stream_static_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_mtp_logprobs_non_stream_static_baseline.txt"
base_file = "21b_ep4_mtp_logprobs_non_stream_static_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = json.load(f)
@@ -539,9 +539,9 @@ def test_stream_with_logprobs(api_url):
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_stream_static_baseline.txt")
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_stream_static_baseline_dev_0311.txt")
else:
base_file = "21b_ep4_mtp_logprobs_stream_static_baseline.txt"
base_file = "21b_ep4_mtp_logprobs_stream_static_baseline_dev_0311.txt"
with open(base_file, "r", encoding="utf-8") as f:
baseline = json.load(f)
+2 -2
@@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
# base result
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115")
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0311")
else:
base_file = "ernie-4_5-vl-base-tp2-dev-0113"
base_file = "ernie-4_5-vl-base-tp2-dev-0311"
with open(base_file, "r") as f:
content2 = f.read()
+1 -1
@@ -179,7 +179,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
f_o.close()
# base result
content2 = "这张图片展示了一群人在进行手工艺活动。前景中有两个孩子和一个成年人,他们似乎在制作或展示某种手工艺品。成年人手里拿着一个扇子,上面有彩色的图案,可能是通过某种方式绘制或涂鸦而成。孩子们看起来很专注,可能是在观察或参与这个过程。\n\n背景中还有其他几个人,其中一个人穿着粉色的衣服,背对着镜头。整个场景看起来像是在一个室内环境中,光线充足,氛围轻松愉快"
content2 = "这张图片展示了一群人在进行某种活动。前景中有两个孩子和一个成年人,他们似乎在观看或参与某个艺术创作过程。成年人手里拿着一个扇子,上面有各种颜色的颜料,看起来像是在指导孩子们如何使用颜料。孩子们的表情专注,似乎对这个活动很感兴趣。背景中还有其他人在进行类似的活动,环境看起来像是在一个室内空间,可能是教室或工作室。整体氛围显得非常温馨和积极"
# Verify that result is same as the base result
assert content1 == content2
+1 -1
@@ -173,7 +173,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
content1 = result1["choices"][0]["message"]["content"]
# base result
content2 = "视频中手机支架的颜色是黑色。"
content2 = "视频中手机支架的颜色是黑色"
# Verify that result is same as the base result
assert content1.startswith(content2), content1
+6 -6
@@ -339,16 +339,16 @@ def test_mtp_accept_ratio(api_url):
print("\nresult:\n", result)
base_path = os.getenv("MODEL_PATH")
baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline_dev.txt")
baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline_dev_0311.txt")
with open(baseline_path, "r", encoding="utf-8") as f:
baseline = f.read()
baseline_ratio = {
"accepted_tokens": 139,
"accepted_tokens": 131,
"rejected_tokens": 23,
"accept_ratio": 0.41726618705035967,
"average_accept_length": 1.7160493827160495,
"accepted_tokens_per_head": [81, 58],
"accept_ratio_per_head": [0.7160493827160493],
"accept_ratio": 0.4122137404580153,
"average_accept_length": 1.7012987012987013,
"accepted_tokens_per_head": [77, 54],
"accept_ratio_per_head": [0.7012987012987013],
}
response = send_request(url=api_url, payload=payload)
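Under one reading of these metrics (inferred from the baseline numbers themselves, not from the metric code), the updated values are internally consistent with the new per-head accepts:

    per_head = [77, 54]                                      # accepted_tokens_per_head
    accepted_tokens = sum(per_head)                          # 131
    average_accept_length = accepted_tokens / per_head[0]    # 1.7012987012987013
    accept_ratio_per_head = [per_head[1] / per_head[0]]      # [0.7012987012987013]
    accept_ratio = per_head[1] / accepted_tokens             # 0.4122137404580153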
@@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
model_path = os.getenv("MODEL_PATH")
if model_path:
baseline_path = os.path.join(
model_path, f"R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
model_path, f"R3_BaseLine_dev_uint8_0311/routing_replay_output_baseline_{model_name}"
)
else:
baseline_path = f"./R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
baseline_path = f"./R3_BaseLine_dev_uint8_0311/routing_replay_output_baseline_{model_name}"
stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")
+2 -5
@@ -84,11 +84,8 @@ class TestSiluAndMul(unittest.TestCase):
layer = SiluAndMul(fd_config)
x = paddle.ones([2, 2])
out = layer.forward(x)
if layer.bias is None and layer.quant_scale == -1:
self.assertTrue((out.numpy() == 0.73105854).all())
else:
self.assertTrue((out.numpy() == 1).all())
mock_fused.assert_called_once()
self.assertTrue((out.numpy() == 1).all())
mock_fused.assert_called_once()
# Test forward computation on GCU platform
@patch(
+1 -1
@@ -146,7 +146,7 @@ def test_model_against_baseline(
# Get baseline suffix from config
model_config = hugging_face_model_param_map.get(model_name_or_path, {})
baseline_suffix = model_config.get("baseline_suffix", "tp2-dev-0226")
baseline_suffix = model_config.get("baseline_suffix", "tp2-dev-0311")
baseline_filename = f"{model_name_or_path}-{baseline_suffix}"
if base_path: