[Feat] ernie4_5_vl_moe support CudaGraph (#3226)

* delete dynamic control flow for decode * code-style * fix scatter/gather typos and use input stream instead of the default stream * support 0-Size Tensor * update runner and model * using static mem address as input * fix mem leak * refine code * update mm_buffer * fix typo * fix buffersize * fix unk token * refine code * refine * support other arch * open cudagraph in vlci * fix * update * update * update * fix cmd * update --------- Co-authored-by: aquagull <hongyuh@qq.com> Co-authored-by: Yuanle Liu <yuanlehome@163.com>
2026-04-23 00:17:25 +08:00 · 2025-09-10 13:11:57 +08:00
parent 9d0074a91a
commit 453487d5b0
9 changed files with 207 additions and 98 deletions
@@ -36,6 +36,9 @@ void MoeDispatchKernel(
    paddle::Tensor *topk_idx, paddle::Tensor *expert_idx_per_token) {
  using namespace phi;

+  if (num_rows == 0){
+    return;
+  }
  typedef PDTraits<T> traits_;
  typedef typename traits_::DataType DataType_;
  typedef typename traits_::data_t data_t;
@@ -185,6 +188,15 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
  auto expert_idx_per_token =
      GetEmptyTensor({num_rows * moe_topk}, paddle::DataType::INT32, place);

+  if (token_rows == 0){
+    return {permute_input,
+            tokens_expert_prefix_sum,
+            permute_indices_per_token,
+            topk_weight,
+            topk_idx,
+            expert_idx_per_token};
+  }
+
  switch (input_type) {
  case paddle::DataType::BFLOAT16:
    MoeDispatchKernel<paddle::DataType::BFLOAT16>(
@@ -412,7 +412,9 @@ const auto t_type = (quant_method == "w4a8") ? up_gate_proj_scale.get().dtype()
                    (quant_method == "w4afp8") ? paddle::DataType::BFLOAT16 :
                    permute_input.dtype();
    auto ffn_out = paddle::empty_like(permute_input, t_type);
-
+    if(permute_input.numel() == 0){
+        return ffn_out;
+    }
    switch (t_type) {
        case paddle::DataType::BFLOAT16:
            MoeFFNKernel<paddle::DataType::BFLOAT16>(permute_input,
@@ -59,6 +59,10 @@ paddle::Tensor MoeExpertReduceFunc(

  auto output = GetEmptyTensor({num_rows, hidden_size}, input_type, place);

+  if(num_rows == 0){
+    return output;
+  }
+
  switch (input_type) {
  case paddle::DataType::BFLOAT16:
    MoeReduceKernel<paddle::DataType::BFLOAT16>(