support ep+tp at op layer (#4688)

This commit is contained in:
zhupengyang
2025-11-05 11:15:57 +08:00
committed by GitHub
parent 937eb3c6ed
commit 2fd254e5b7
8 changed files with 138 additions and 105 deletions
+18
View File
@@ -1156,6 +1156,13 @@ class XPUModelRunner(ModelRunnerBase):
# 1. Prepare inputs of model and decoder.
self._prepare_inputs(is_dummy_run=is_dummy_run)
# NOTE(wufeisheng): If `not_need_stop()` is False, it means the current worker is in an idle state.
# This logic is not used in TP (Tensor Parallelism) mode. However, in EP (Expert Parallelism) mode,
# when there is data on other runners, the current runner is required to execute part of the model.
if not self.not_need_stop() and not is_dummy_run:
self._execute_empty_input()
return None
# 2. Pad inputs for CUDA graph
# 3. Execute model
@@ -1225,6 +1232,17 @@ class XPUModelRunner(ModelRunnerBase):
return None
def _execute_empty_input(self) -> None:
"""
In certain scenarios, such as during EP,
the runner needs to execute partial modules of the model without input data.
This requires the model to implement the `empty_input_forward` method.
"""
if hasattr(self.model, "empty_input_forward"):
self.model.empty_input_forward()
else:
raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
@profile_run_guard(True)
def profile_run(self) -> None:
"""Execute a forward pass with dummy inputs to profile the memory usage of the model"""