mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-05-06 15:40:33 +08:00
[XPU] support XPU VL model inference (#4030)
* [XPU] support XPU VL model inference
* fix image op import and device check
* rebase develop
* fix perf
This commit is contained in:
@@ -183,5 +183,7 @@ class XPUAttentionBackend(AttentionBackend):
|
||||
forward_meta.encoder_batch_map_cpu,
|
||||
forward_meta.decoder_context_len_cpu,
|
||||
forward_meta.decoder_batch_map_cpu,
|
||||
+ forward_meta.pos_emb_type,
|
||||
+ self.rope_3d,
|
||||
)
|
||||
return res
|
||||
|
||||
@@ -72,7 +72,7 @@ class XPUMoEMethod(UnquantizedFusedMoEMethod):
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
- if layer.tp_size > 1:
|
||||
+ if layer.reduce_results and layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
@@ -252,7 +252,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
|
||||
layer.top_k,
|
||||
False, # moe group, used in deepseek
|
||||
)
|
||||
- if layer.tp_size > 1:
|
||||
+ if layer.reduce_results and layer.tp_size > 1:
|
||||
from fastdeploy.distributed.communication import (
|
||||
tensor_model_parallel_all_reduce,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user