mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] use phi permute/unpermute & rm swiglu (#6361)
* tp文字输出正常 * B eb5 mini文字输出正常 * eb5mini ep B卡 文字输出正常 * default use phi moe op * stash * tp H卡正常 * ep ok * rm debug * rm debug tool * rm del ffn_out * rm swiglu * add envs to swiglu * merge dev * fix ci baseline Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix ci baseline 2 --------- Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -185,7 +185,7 @@ jobs:
|
||||
-d "{\"messages\": [{\"role\": \"user\", \"content\": \"1+1=?\"}], \"logprobs\": true}"
|
||||
set +e
|
||||
rm -rf ./baseline_output
|
||||
cp -r baseline_dev/ERNIE-4.5-0.3B-Paddle ./baseline_output
|
||||
cp -r baseline_dev_0311/ERNIE-4.5-0.3B-Paddle ./baseline_output
|
||||
LOGPROB_EXIT_CODE=0
|
||||
python3.10 lanucher.py --request_template TOKEN_LOGPROB --url http://localhost:${FD_API_PORT}/v1/chat/completions --case ./cases/demo.yaml --concurrency 1 --name demo --exe logprob || LOGPROB_EXIT_CODE=$?
|
||||
echo "LOGPROB_EXIT_CODE=${LOGPROB_EXIT_CODE}" > /workspace/exit_code.env
|
||||
|
||||
@@ -200,6 +200,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"FD_XPU_ENABLE_MIXED_EP_MODE": lambda: bool(int(os.getenv("FD_XPU_ENABLE_MIXED_EP_MODE", "0"))),
|
||||
# Whether to use phi FP8 quantization; if 1, use the paddle default.
|
||||
"FD_USE_PHI_FP8_QUANT": lambda: bool(int(os.getenv("FD_USE_PHI_FP8_QUANT", "1"))),
|
||||
# Whether to use phi MOE permute; if 1, use the paddle default.
|
||||
"FD_USE_PHI_MOE_PERMUTE": lambda: bool(int(os.getenv("FD_USE_PHI_MOE_PERMUTE", "1"))),
|
||||
# Control whether class SiluAndMul uses the swiglu or fused_bias_act operator in the forward_cuda function
|
||||
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
|
||||
# Reserve output blocks for decoding requests when schedule new prefill requests
|
||||
"FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL": lambda: int(
|
||||
os.getenv("FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL", "16")
|
||||
|
||||
@@ -20,6 +20,7 @@ import paddle
|
||||
from paddle import nn
|
||||
from paddle.incubate.nn.functional import fused_bias_act, swiglu
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
@@ -120,7 +121,7 @@ class SiluAndMul(nn.Layer):
|
||||
Returns:
|
||||
Tensor: Output tensor.
|
||||
"""
|
||||
if self.bias is None and self.quant_scale == -1:
|
||||
if self.bias is None and self.quant_scale == -1 and envs.FD_SiluAndMul_USE_PHI_SWIGLU:
|
||||
return paddle.nn.functional.swiglu(x)
|
||||
return fused_bias_act(
|
||||
x,
|
||||
|
||||
@@ -151,7 +151,8 @@ def m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
|
||||
(permute_input.shape[0], layer_added_weight_attrs_0.shape[1]),
|
||||
dtype=paddle.bfloat16,
|
||||
)
|
||||
if disable_ue8m0_cast:
|
||||
# if disable_ue8m0_cast:
|
||||
if permute_scale.strides[0] != 1:
|
||||
permute_scale = permute_scale.transpose([1, 0]).contiguous()
|
||||
permute_scale = permute_scale.transpose([1, 0])
|
||||
# disable_ue8m0_cast is False for SM100
|
||||
@@ -487,31 +488,52 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
elif token_all_num > 0:
|
||||
logger.debug(f"token_all_num {token_all_num}")
|
||||
|
||||
token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
|
||||
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
|
||||
recv_topk_idx = recv_topk_idx.astype(paddle.int32)
|
||||
(
|
||||
permute_input,
|
||||
permute_indices_per_token, # == zipped_expertwise_rowmap
|
||||
dst_weights,
|
||||
permute_scale,
|
||||
m_indices,
|
||||
) = paddle.nn.functional.moe_permute(
|
||||
hidden_states=recv_x,
|
||||
scale=recv_x_scale,
|
||||
expert_routemap_topk=recv_topk_idx,
|
||||
expert_prob_topk=recv_topk_weights,
|
||||
num_experts=layer.num_local_experts,
|
||||
tokens_per_expert=[],
|
||||
padding_alignment=128,
|
||||
return_expert_indices=True,
|
||||
override_buffer_size=token_all_num,
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
)
|
||||
else:
|
||||
token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
|
||||
(
|
||||
permute_input,
|
||||
permute_scale,
|
||||
permute_indices_per_token,
|
||||
recv_num_tokens_per_expert_list_cumsum,
|
||||
recv_num_tokens_per_expert_list_padded_cumsum,
|
||||
dst_weights,
|
||||
dst_indices,
|
||||
cumsum_idx_gpu,
|
||||
m_indices,
|
||||
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
|
||||
recv_x,
|
||||
recv_x_scale,
|
||||
recv_topk_idx,
|
||||
recv_topk_weights,
|
||||
token_nums_this_rank[0],
|
||||
token_nums_this_rank[1],
|
||||
True, # use_in_ep
|
||||
token_all_num,
|
||||
)
|
||||
|
||||
(
|
||||
permute_input,
|
||||
permute_scale,
|
||||
permute_indices_per_token,
|
||||
_,
|
||||
_,
|
||||
dst_weights,
|
||||
dst_indices,
|
||||
_,
|
||||
m_indices,
|
||||
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
|
||||
recv_x_value,
|
||||
recv_x_scale,
|
||||
recv_topk_idx,
|
||||
recv_topk_weights,
|
||||
token_nums_this_rank[0],
|
||||
token_nums_this_rank[1],
|
||||
True, # use_in_ep
|
||||
token_all_num,
|
||||
)
|
||||
assert permute_input.shape[0] == token_all_num
|
||||
|
||||
if not self.quant_config.deepgemm_scale_ue8m0:
|
||||
if permute_scale.strides[0] != 1:
|
||||
permute_scale = permute_scale.transpose([1, 0]).contiguous().transpose([1, 0])
|
||||
|
||||
# up_gate_proj
|
||||
@@ -553,20 +575,30 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
ffn_out,
|
||||
m_indices,
|
||||
)
|
||||
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
|
||||
tmp_ffn_out, out_probs = paddle.nn.functional.moe_unpermute(
|
||||
hidden_states_unzipped=ffn_out,
|
||||
zipped_expertwise_rowmap=permute_indices_per_token,
|
||||
expert_routemap_topk=recv_topk_idx,
|
||||
token_prob_unzipped=dst_weights,
|
||||
total_zipped_tokens=recv_x.shape[0],
|
||||
num_experts=layer.num_local_experts,
|
||||
using_weighted_combine=True,
|
||||
)
|
||||
|
||||
# prmt back per rank
|
||||
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
dst_weights,
|
||||
permute_indices_per_token,
|
||||
dst_indices,
|
||||
None, # down_proj_bias
|
||||
False, # norm_topk_prob
|
||||
1.0,
|
||||
)
|
||||
else:
|
||||
# prmt back per rank
|
||||
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
dst_weights,
|
||||
permute_indices_per_token,
|
||||
dst_indices,
|
||||
None, # down_proj_bias
|
||||
False, # norm_topk_prob
|
||||
1.0,
|
||||
)
|
||||
else:
|
||||
tmp_ffn_out = paddle.empty([0, hidden_size], paddle.bfloat16)
|
||||
|
||||
# 5. EP combine
|
||||
event = deep_ep.Buffer.capture()
|
||||
if self.ep_prefill_runner.num_worst_tokens <= 0:
|
||||
@@ -697,14 +729,11 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
if topk_ids_hookfunc is not None:
|
||||
topk_ids_hookfunc(topk_ids=topk_ids)
|
||||
|
||||
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
|
||||
|
||||
if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
|
||||
recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
|
||||
x, 128, self.quant_config.deepgemm_scale_ue8m0
|
||||
)
|
||||
else:
|
||||
|
||||
recv_x, recv_x_scale = paddle.incubate.nn.functional.fp8_quant_blockwise(
|
||||
x,
|
||||
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
@@ -717,26 +746,49 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
else recv_x_scale.T[: recv_x.shape[0]]
|
||||
)
|
||||
|
||||
(
|
||||
permute_input,
|
||||
permute_scale,
|
||||
permute_indices_per_token,
|
||||
recv_num_tokens_per_expert_list_cumsum,
|
||||
recv_num_tokens_per_expert_list_padded_cumsum,
|
||||
dst_weights,
|
||||
dst_indices,
|
||||
cumsum_idx_gpu,
|
||||
m_indices,
|
||||
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
|
||||
recv_x,
|
||||
recv_x_scale,
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
tmp[0],
|
||||
tmp[1],
|
||||
False, # use_in_ep
|
||||
-1,
|
||||
)
|
||||
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
|
||||
topk_ids = topk_ids.astype(paddle.int32)
|
||||
override_buffer_size = recv_x.shape[0] * layer.top_k + layer.num_experts * (128 - 1)
|
||||
(
|
||||
permute_input,
|
||||
permute_indices_per_token, # == zipped_expertwise_rowmap
|
||||
dst_weights,
|
||||
permute_scale,
|
||||
m_indices,
|
||||
) = paddle.nn.functional.moe_permute(
|
||||
hidden_states=recv_x,
|
||||
scale=recv_x_scale,
|
||||
expert_routemap_topk=topk_ids,
|
||||
expert_prob_topk=topk_weights,
|
||||
num_experts=layer.num_experts,
|
||||
tokens_per_expert=[],
|
||||
padding_alignment=128,
|
||||
return_expert_indices=True,
|
||||
override_buffer_size=override_buffer_size,
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
)
|
||||
else:
|
||||
tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)
|
||||
(
|
||||
permute_input,
|
||||
permute_scale,
|
||||
permute_indices_per_token,
|
||||
recv_num_tokens_per_expert_list_cumsum,
|
||||
recv_num_tokens_per_expert_list_padded_cumsum,
|
||||
dst_weights,
|
||||
dst_indices,
|
||||
cumsum_idx_gpu,
|
||||
m_indices,
|
||||
) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
|
||||
recv_x,
|
||||
recv_x_scale,
|
||||
topk_ids,
|
||||
topk_weights,
|
||||
tmp[0],
|
||||
tmp[1],
|
||||
False, # use_in_ep
|
||||
-1,
|
||||
)
|
||||
|
||||
ffn_out = m_grouped_fp8_gemm_nt_contiguous_custom_python_op(
|
||||
permute_input,
|
||||
@@ -751,14 +803,24 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
)
|
||||
|
||||
# prmt back per rank
|
||||
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
dst_weights,
|
||||
permute_indices_per_token,
|
||||
dst_indices,
|
||||
None,
|
||||
False, # norm_topk_prob
|
||||
1.0,
|
||||
)
|
||||
|
||||
if fastdeploy.envs.FD_USE_PHI_MOE_PERMUTE:
|
||||
tmp_ffn_out, out_probs = paddle.nn.functional.moe_unpermute(
|
||||
hidden_states_unzipped=ffn_out,
|
||||
zipped_expertwise_rowmap=permute_indices_per_token,
|
||||
expert_routemap_topk=topk_ids,
|
||||
token_prob_unzipped=dst_weights,
|
||||
total_zipped_tokens=recv_x.shape[0],
|
||||
num_experts=layer.num_experts,
|
||||
using_weighted_combine=True,
|
||||
)
|
||||
else:
|
||||
tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
|
||||
ffn_out,
|
||||
dst_weights,
|
||||
permute_indices_per_token,
|
||||
dst_indices,
|
||||
None,
|
||||
False, # norm_topk_prob
|
||||
1.0,
|
||||
)
|
||||
return tmp_ffn_out
|
||||
|
||||
@@ -25,6 +25,7 @@ from paddleformers.transformers import PretrainedModel
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
|
||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||
from fastdeploy.model_executor.graph_optimization.decorator import (
|
||||
support_graph_optimization,
|
||||
@@ -161,6 +162,7 @@ class Glm4Moe(nn.Layer):
|
||||
|
||||
self.experts = FusedMoE(
|
||||
fd_config,
|
||||
reduce_results=False,
|
||||
renormalize=self.norm_topk_prob,
|
||||
moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
|
||||
num_experts=fd_config.model_config.n_routed_experts,
|
||||
@@ -181,14 +183,21 @@ class Glm4Moe(nn.Layer):
|
||||
intermediate_size=shared_experts_intermediate_size,
|
||||
layer_id=layer_id,
|
||||
prefix=f"{prefix}.shared_experts",
|
||||
reduce_results=False,
|
||||
)
|
||||
|
||||
def forward(self, x, forward_meta: ForwardMeta = None):
|
||||
# Both experts and shared_experts return partial sums (no all-reduce).
|
||||
# Combine them first, then do a single all-reduce — eliminating one
|
||||
# collective communication compared to the naive sequential approach.
|
||||
# NOTE: only valid for pure-TP mode (use_ep=False). In EP or EP+TP modes
|
||||
# FusedMoE uses all-to-all internally and already produces a full result,
|
||||
# so the extra all-reduce must be skipped to avoid double-reduction.
|
||||
out = self.experts(x, self.gate, forward_meta)
|
||||
if self.n_shared_experts > 0:
|
||||
shared_experts_out = self.shared_experts(x)
|
||||
out = out + shared_experts_out
|
||||
|
||||
out = out + self.shared_experts(x)
|
||||
if self.use_tp and not self.use_ep:
|
||||
out = tensor_model_parallel_all_reduce(out, self.tp_group)
|
||||
return out
|
||||
|
||||
|
||||
@@ -535,7 +544,9 @@ class Glm4MoeForCausalLM(ModelForCasualLM):
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
""" """
|
||||
paddle.cuda.nvtx.range_push("GLM4_MOE_BF")
|
||||
hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
|
||||
paddle.cuda.nvtx.range_pop()
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
@@ -25,10 +25,10 @@ def test_unstream_with_logprobs():
|
||||
# 校验返回内容与概率信息
|
||||
assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
|
||||
assert resp_json["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
|
||||
assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
|
||||
assert resp_json["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
|
||||
assert resp_json["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
|
||||
"token": "牛顿",
|
||||
"logprob": -0.03113006055355072,
|
||||
"logprob": -0.031025361269712448,
|
||||
"bytes": [231, 137, 155, 233, 161, 191],
|
||||
"top_logprobs": None,
|
||||
}
|
||||
@@ -102,10 +102,10 @@ def test_stream_with_logprobs():
|
||||
# 校验概率字段
|
||||
assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.03113006055355072
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.031025361269712448
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
|
||||
"token": "牛顿",
|
||||
"logprob": -0.03113006055355072,
|
||||
"logprob": -0.031025361269712448,
|
||||
"bytes": [231, 137, 155, 233, 161, 191],
|
||||
}
|
||||
|
||||
@@ -187,10 +187,10 @@ def test_stream_with_temp_scaled_logprobs():
|
||||
# 校验概率字段
|
||||
assert result_chunk["choices"][0]["delta"]["content"] == "牛顿"
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["token"] == "牛顿"
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.0068125599063932896
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["logprob"] == -0.006811376195400953
|
||||
assert result_chunk["choices"][0]["logprobs"]["content"][0]["top_logprobs"][0] == {
|
||||
"token": "牛顿",
|
||||
"logprob": -0.0068125599063932896,
|
||||
"logprob": -0.006811376195400953,
|
||||
"bytes": [231, 137, 155, 233, 161, 191],
|
||||
}
|
||||
|
||||
|
||||
@@ -205,9 +205,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
|
||||
# base result
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115")
|
||||
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0311")
|
||||
else:
|
||||
base_file = "ernie-4_5-vl-base-tp2-dev-0113"
|
||||
base_file = "ernie-4_5-vl-base-tp2-dev-0311"
|
||||
with open(base_file, "r") as f:
|
||||
content2 = f.read()
|
||||
|
||||
|
||||
@@ -185,7 +185,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
|
||||
# 校验返回内容与概率信息
|
||||
assert (
|
||||
resp_json["choices"][0]["message"]["content"]
|
||||
== "\n<think>这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典"
|
||||
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
|
||||
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."
|
||||
|
||||
|
||||
|
||||
@@ -182,7 +182,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload):
|
||||
# 校验返回内容与概率信息
|
||||
assert (
|
||||
resp_json["choices"][0]["message"]["content"]
|
||||
== "\n<think>我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典"
|
||||
== "\n<think>这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典"
|
||||
), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}."
|
||||
|
||||
|
||||
|
||||
@@ -328,9 +328,9 @@ def test_text_diff(api_url):
|
||||
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_text_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_text_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_text_baseline.txt"
|
||||
base_file = "21b_ep4_text_baseline_dev_0311.txt"
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = f.read()
|
||||
|
||||
@@ -596,9 +596,9 @@ def test_non_stream_with_logprobs(api_url):
|
||||
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_logprobs_non_stream_static_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_logprobs_non_stream_static_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_logprobs_non_stream_static_baseline.txt"
|
||||
base_file = "21b_ep4_logprobs_non_stream_static_baseline_dev_0311.txt"
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = json.load(f)
|
||||
|
||||
@@ -629,9 +629,9 @@ def test_stream_with_logprobs(api_url):
|
||||
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_logprobs_stream_static_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_logprobs_stream_static_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_logprobs_stream_static_baseline.txt"
|
||||
base_file = "21b_ep4_logprobs_stream_static_baseline_dev_0311.txt"
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = json.load(f)
|
||||
|
||||
|
||||
@@ -337,9 +337,9 @@ def test_text_diff(api_url):
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_text_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_text_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_mtp_text_baseline.txt"
|
||||
base_file = "21b_ep4_mtp_text_baseline_dev_0311.txt"
|
||||
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = f.read()
|
||||
@@ -504,9 +504,9 @@ def test_non_stream_with_logprobs(api_url):
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_non_stream_static_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_non_stream_static_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_mtp_logprobs_non_stream_static_baseline.txt"
|
||||
base_file = "21b_ep4_mtp_logprobs_non_stream_static_baseline_dev_0311.txt"
|
||||
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = json.load(f)
|
||||
@@ -539,9 +539,9 @@ def test_stream_with_logprobs(api_url):
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_stream_static_baseline.txt")
|
||||
base_file = os.path.join(base_path, "21b_ep4_mtp_logprobs_stream_static_baseline_dev_0311.txt")
|
||||
else:
|
||||
base_file = "21b_ep4_mtp_logprobs_stream_static_baseline.txt"
|
||||
base_file = "21b_ep4_mtp_logprobs_stream_static_baseline_dev_0311.txt"
|
||||
|
||||
with open(base_file, "r", encoding="utf-8") as f:
|
||||
baseline = json.load(f)
|
||||
|
||||
@@ -204,9 +204,9 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
|
||||
# base result
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
if base_path:
|
||||
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0115")
|
||||
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-dev-0311")
|
||||
else:
|
||||
base_file = "ernie-4_5-vl-base-tp2-dev-0113"
|
||||
base_file = "ernie-4_5-vl-base-tp2-dev-0311"
|
||||
with open(base_file, "r") as f:
|
||||
content2 = f.read()
|
||||
|
||||
|
||||
@@ -179,7 +179,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
|
||||
f_o.close()
|
||||
|
||||
# base result
|
||||
content2 = "这张图片展示了一群人在进行手工艺活动。前景中有两个孩子和一个成年人,他们似乎在制作或展示某种手工艺品。成年人手里拿着一个扇子,上面有彩色的图案,可能是通过某种方式绘制或涂鸦而成。孩子们看起来很专注,可能是在观察或参与这个过程。\n\n背景中还有其他几个人,其中一个人穿着粉色的衣服,背对着镜头。整个场景看起来像是在一个室内环境中,光线充足,氛围轻松愉快。"
|
||||
content2 = "这张图片展示了一群人在进行某种活动。前景中有两个孩子和一个成年人,他们似乎在观看或参与某个艺术创作过程。成年人手里拿着一个扇子,上面有各种颜色的颜料,看起来像是在指导孩子们如何使用颜料。孩子们的表情专注,似乎对这个活动很感兴趣。背景中还有其他人在进行类似的活动,环境看起来像是在一个室内空间,可能是教室或工作室。整体氛围显得非常温馨和积极。"
|
||||
|
||||
# Verify that result is same as the base result
|
||||
assert content1 == content2
|
||||
|
||||
@@ -173,7 +173,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
|
||||
content1 = result1["choices"][0]["message"]["content"]
|
||||
|
||||
# base result
|
||||
content2 = "视频中手机支架的颜色是黑色。"
|
||||
content2 = "视频中手机支架的颜色是黑色的。"
|
||||
|
||||
# Verify that result is same as the base result
|
||||
assert content1.startswith(content2), content1
|
||||
|
||||
@@ -339,16 +339,16 @@ def test_mtp_accept_ratio(api_url):
|
||||
print("\nresult:\n", result)
|
||||
|
||||
base_path = os.getenv("MODEL_PATH")
|
||||
baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline_dev.txt")
|
||||
baseline_path = os.path.join(base_path, "21b_mtp_accept_ratio_baseline_dev_0311.txt")
|
||||
with open(baseline_path, "r", encoding="utf-8") as f:
|
||||
baseline = f.read()
|
||||
baseline_ratio = {
|
||||
"accepted_tokens": 139,
|
||||
"accepted_tokens": 131,
|
||||
"rejected_tokens": 23,
|
||||
"accept_ratio": 0.41726618705035967,
|
||||
"average_accept_length": 1.7160493827160495,
|
||||
"accepted_tokens_per_head": [81, 58],
|
||||
"accept_ratio_per_head": [0.7160493827160493],
|
||||
"accept_ratio": 0.4122137404580153,
|
||||
"average_accept_length": 1.7012987012987013,
|
||||
"accepted_tokens_per_head": [77, 54],
|
||||
"accept_ratio_per_head": [0.7012987012987013],
|
||||
}
|
||||
|
||||
response = send_request(url=api_url, payload=payload)
|
||||
|
||||
@@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode
|
||||
model_path = os.getenv("MODEL_PATH")
|
||||
if model_path:
|
||||
baseline_path = os.path.join(
|
||||
model_path, f"R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
|
||||
model_path, f"R3_BaseLine_dev_uint8_0311/routing_replay_output_baseline_{model_name}"
|
||||
)
|
||||
else:
|
||||
baseline_path = f"./R3_BaseLine_dev_uint8_0205/routing_replay_output_baseline_{model_name}"
|
||||
baseline_path = f"./R3_BaseLine_dev_uint8_0311/routing_replay_output_baseline_{model_name}"
|
||||
stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream")
|
||||
|
||||
nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")
|
||||
|
||||
@@ -84,11 +84,8 @@ class TestSiluAndMul(unittest.TestCase):
|
||||
layer = SiluAndMul(fd_config)
|
||||
x = paddle.ones([2, 2])
|
||||
out = layer.forward(x)
|
||||
if layer.bias is None and layer.quant_scale == -1:
|
||||
self.assertTrue((out.numpy() == 0.73105854).all())
|
||||
else:
|
||||
self.assertTrue((out.numpy() == 1).all())
|
||||
mock_fused.assert_called_once()
|
||||
self.assertTrue((out.numpy() == 1).all())
|
||||
mock_fused.assert_called_once()
|
||||
|
||||
# Test forward computation on GCU platform
|
||||
@patch(
|
||||
|
||||
@@ -146,7 +146,7 @@ def test_model_against_baseline(
|
||||
|
||||
# Get baseline suffix from config
|
||||
model_config = hugging_face_model_param_map.get(model_name_or_path, {})
|
||||
baseline_suffix = model_config.get("baseline_suffix", "tp2-dev-0226")
|
||||
baseline_suffix = model_config.get("baseline_suffix", "tp2-dev-0311")
|
||||
baseline_filename = f"{model_name_or_path}-{baseline_suffix}"
|
||||
|
||||
if base_path:
|
||||
|
||||
Reference in New Issue
Block a user