mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
Use Triton qk_norm in both Prefill and Decode (#7213)
Co-authored-by: liuruian <liuruian@baidu.com>
This commit is contained in:
@@ -341,7 +341,7 @@ class QKRMSNorm(nn.Layer):
|
||||
forward_meta,
|
||||
proxy_rmsnorm=None,
|
||||
) -> paddle.Tensor:
|
||||
if proxy_rmsnorm is None and self.qk_norm_fused and forward_meta.step_use_cudagraph:
|
||||
if proxy_rmsnorm is None and self.qk_norm_fused:
|
||||
qkv_out = qk_rmsnorm_fused(
|
||||
qkv_out,
|
||||
self.q_norm.weight,
|
||||
|
||||
@@ -173,7 +173,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
|
||||
content1 = result1["choices"][0]["message"]["content"]
|
||||
|
||||
# base result
|
||||
content2 = "视频中手机支架的颜色是黑色的。"
|
||||
content2 = "视频中手机支架的颜色是黑色。"
|
||||
|
||||
# Verify that result is same as the base result
|
||||
assert content1.startswith(content2), content1
|
||||
|
||||
Reference in New Issue
Block a user