mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Others] enable use of PFCC deep_ep (#5822)
* upstream deep_ep
* fix bug
* fix bug
* modify env name
This commit is contained in:
@@ -20,8 +20,14 @@ import paddle
|
||||
from paddle import nn
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
from fastdeploy import envs
|
||||
|
||||
# Import the optional deep_ep communication backend.
#
# When the FD_USE_PFCC_DEEP_EP env flag is set, the standalone PFCC
# ``deep_ep`` package is imported, with Paddle's torch proxy enabled
# first (restricted to the ``deep_ep`` scope) so that package's imports
# resolve correctly.  Otherwise the implementation bundled with
# ``paddle.distributed.communication`` is used.
#
# Import failure is deliberately non-fatal: deep_ep is only required for
# expert-parallel paths, so we log a warning and leave ``deep_ep``
# unbound for callers that never reach those paths.
try:
    if envs.FD_USE_PFCC_DEEP_EP:
        # Enable torch proxy before importing deep_ep.
        paddle.compat.enable_torch_proxy(scope={"deep_ep"})
        import deep_ep
    else:
        from paddle.distributed.communication import deep_ep
except Exception:
    # NOTE(review): was a bare ``except:``, which also swallows
    # KeyboardInterrupt/SystemExit; ``Exception`` keeps the best-effort
    # fallback without trapping interpreter-exit signals.
    logger.warning("import deep_ep Failed!")
|
||||
|
||||
@@ -280,23 +286,40 @@ class DeepEPEngine:
|
||||
if self.deepep_engine is None:
|
||||
raise RuntimeError("DeepEP buffer not initialized!")
|
||||
|
||||
(
|
||||
packed_recv_x,
|
||||
recv_expert_count,
|
||||
handle,
|
||||
_,
|
||||
dispatch_hook,
|
||||
) = self.deepep_engine.low_latency_dispatch(
|
||||
hidden_states,
|
||||
topk_idx,
|
||||
expertwise_scale,
|
||||
self.buffer.num_max_dispatch_tokens_per_rank,
|
||||
self.num_experts,
|
||||
use_fp8=use_fp8,
|
||||
async_finish=False,
|
||||
return_recv_hook=True,
|
||||
num_per_channel=quant_group_size,
|
||||
)
|
||||
if envs.FD_USE_PFCC_DEEP_EP:
|
||||
(
|
||||
packed_recv_x,
|
||||
recv_expert_count,
|
||||
handle,
|
||||
_,
|
||||
dispatch_hook,
|
||||
) = self.deepep_engine.low_latency_dispatch(
|
||||
hidden_states,
|
||||
topk_idx,
|
||||
self.buffer.num_max_dispatch_tokens_per_rank,
|
||||
self.num_experts,
|
||||
use_fp8=use_fp8,
|
||||
async_finish=False,
|
||||
return_recv_hook=True,
|
||||
)
|
||||
else:
|
||||
(
|
||||
packed_recv_x,
|
||||
recv_expert_count,
|
||||
handle,
|
||||
_,
|
||||
dispatch_hook,
|
||||
) = self.deepep_engine.low_latency_dispatch(
|
||||
hidden_states,
|
||||
topk_idx,
|
||||
expertwise_scale,
|
||||
self.buffer.num_max_dispatch_tokens_per_rank,
|
||||
self.num_experts,
|
||||
use_fp8=use_fp8,
|
||||
async_finish=False,
|
||||
return_recv_hook=True,
|
||||
num_per_channel=quant_group_size,
|
||||
)
|
||||
|
||||
return packed_recv_x, recv_expert_count, handle, dispatch_hook
|
||||
|
||||
|
||||
Reference in New Issue
Block a user