[Optim] Robust status sync when preemption happens (#5796)

* [Bug fix] Sync status for caching output cache
* fix
* fix
* fix bug
* fix
* fix
* support xpu
* fix
* fix
* fix
* fix
* fix
* fix ci
* fix ci
* fix xpu

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2026-01-14 12:07:33 +08:00
parent 0d1a5e70bc
commit 74d0f1c01f
17 changed files with 442 additions and 354 deletions
@@ -30,104 +30,108 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
                                const paddle::Tensor& not_need_stop,
                                const paddle::Tensor& seq_lens_decoder,
                                const paddle::Tensor& prompt_lens,
+                                const paddle::Tensor& preempted_idx,
                                int64_t rank_id,
                                int msg_queue_id,
                                int save_each_rank,
                                bool skip_prefill) {
-    // printf("enter save output");
-    if (!save_each_rank && rank_id > 0) {
-        return;
-    }
-
-    int max_draft_tokens = accept_tokens.shape()[1];
-
-    auto accept_tokens_cpu = accept_tokens.copy_to(paddle::CPUPlace(), true);
-    auto accept_num_cpu = accept_num.copy_to(paddle::CPUPlace(), true);
-    int64_t* accept_tokens_data = accept_tokens_cpu.data<int64_t>();
-    int* accept_num_data = accept_num_cpu.data<int>();
-
-    auto seq_lens_decoder_cpu = seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
-    auto prompt_lens_cpu = prompt_lens.copy_to(paddle::CPUPlace(), true);
-    int* seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
-    int64_t* prompt_lens_data = prompt_lens_cpu.data<int64_t>();
-
-    if (const char* inference_msg_queue_id_env_p =
-            std::getenv("INFERENCE_MSG_QUEUE_ID")) {
-        std::string inference_msg_queue_id_env_str(
-            inference_msg_queue_id_env_p);
-        int inference_msg_queue_id_from_env =
-            std::stoi(inference_msg_queue_id_env_str);
-#ifdef GET_OUTPUT_DEBUG
-        std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
-                  << inference_msg_queue_id_from_env << std::endl;
-#endif
-        msg_queue_id = inference_msg_queue_id_from_env;
-    }
-    static struct speculate_msgdata msg_sed;
-    static key_t key = ftok("./", msg_queue_id);
-    static int msgid = msgget(key, IPC_CREAT | 0666);
-
-    msg_sed.mtype = 1;
-    bool not_need_stop_data = not_need_stop.data<bool>()[0];
-
-    int inference_msg_id_from_env = 1;
-    if (const char* inference_msg_id_env_p = std::getenv("INFERENCE_MSG_ID")) {
-        std::string inference_msg_id_env_str(inference_msg_id_env_p);
-        inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
-        if (inference_msg_id_from_env == 2) {
-            // 2 and -2 is preserve for no-output indication.
-            throw std::runtime_error(
-                " INFERENCE_MSG_ID cannot be 2, please use other number.");
-        }
-        if (inference_msg_id_from_env < 0) {
-            throw std::runtime_error(
-                " INFERENCE_MSG_ID cannot be negative, please use other "
-                "number.");
-        }
-
-#ifdef SAVE_WITH_OUTPUT_DEBUG
-        std::cout << "Your INFERENCE_MSG_ID is: " << inference_msg_id_from_env
-                  << std::endl;
-#endif
-    } else {
-#ifdef SAVE_WITH_OUTPUT_DEBUG
-        std::cout
-            << "Failed to got INFERENCE_MSG_ID at env, use (int)1 as default."
-            << std::endl;
-#endif
-    }
-
-    msg_sed.mtext[0] = not_need_stop_data ? inference_msg_id_from_env
-                                          : -inference_msg_id_from_env;
-    int bsz = accept_tokens.shape()[0];
-    msg_sed.mtext[1] = bsz;
-
-    for (int i = 2; i < MAX_BSZ + 2; i++) {
-        if (i - 2 >= bsz || (skip_prefill && seq_lens_decoder_data[i - 2] < prompt_lens_data[i - 2])) {
-            msg_sed.mtext[i] = 0;
-        } else {
-            msg_sed.mtext[i] = (int)accept_num_data[i - 2];
-        }
-    }
-    for (int i = MAX_BSZ + 2; i < MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ + 2;
-         i++) {
-        int token_id = i - MAX_BSZ - 2;
-        int bid = token_id / MAX_DRAFT_TOKENS;
-        int local_token_id = token_id % MAX_DRAFT_TOKENS;
-        if (token_id / MAX_DRAFT_TOKENS >= bsz) {
-            msg_sed.mtext[i] = 0;
-        } else {
-            msg_sed.mtext[i] =
-                accept_tokens_data[bid * max_draft_tokens + local_token_id];
-        }
-    }
-    if ((msgsnd(msgid,
-                &msg_sed,
-                (MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ + 2) * 4,
-                0)) == -1) {
-        printf("full msg buffer\n");
-    }
+  // printf("enter save output");
+  if (!save_each_rank && rank_id > 0) {
    return;
+  }
+
+  int max_draft_tokens = accept_tokens.shape()[1];
+
+  auto accept_tokens_cpu = accept_tokens.copy_to(paddle::CPUPlace(), true);
+  auto accept_num_cpu = accept_num.copy_to(paddle::CPUPlace(), true);
+  int64_t* accept_tokens_data = accept_tokens_cpu.data<int64_t>();
+  int* accept_num_data = accept_num_cpu.data<int>();
+
+  auto seq_lens_decoder_cpu =
+      seq_lens_decoder.copy_to(paddle::CPUPlace(), true);
+  auto prompt_lens_cpu = prompt_lens.copy_to(paddle::CPUPlace(), true);
+  int* seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
+  int64_t* prompt_lens_data = prompt_lens_cpu.data<int64_t>();
+  const int32_t* preempted_idx_data = preempted_idx.data<int32_t>();
+
+  if (const char* inference_msg_queue_id_env_p =
+          std::getenv("INFERENCE_MSG_QUEUE_ID")) {
+    std::string inference_msg_queue_id_env_str(inference_msg_queue_id_env_p);
+    int inference_msg_queue_id_from_env =
+        std::stoi(inference_msg_queue_id_env_str);
+#ifdef GET_OUTPUT_DEBUG
+    std::cout << "Your INFERENCE_MSG_QUEUE_ID is: "
+              << inference_msg_queue_id_from_env << std::endl;
+#endif
+    msg_queue_id = inference_msg_queue_id_from_env;
+  }
+  static struct speculate_msgdata msg_sed;
+  static key_t key = ftok("./", msg_queue_id);
+  static int msgid = msgget(key, IPC_CREAT | 0666);
+
+  msg_sed.mtype = 1;
+  bool not_need_stop_data = not_need_stop.data<bool>()[0];
+
+  int inference_msg_id_from_env = 1;
+  if (const char* inference_msg_id_env_p = std::getenv("INFERENCE_MSG_ID")) {
+    std::string inference_msg_id_env_str(inference_msg_id_env_p);
+    inference_msg_id_from_env = std::stoi(inference_msg_id_env_str);
+    if (inference_msg_id_from_env == 2) {
+      // 2 and -2 is preserve for no-output indication.
+      throw std::runtime_error(
+          " INFERENCE_MSG_ID cannot be 2, please use other number.");
+    }
+    if (inference_msg_id_from_env < 0) {
+      throw std::runtime_error(
+          " INFERENCE_MSG_ID cannot be negative, please use other "
+          "number.");
+    }
+
+#ifdef SAVE_WITH_OUTPUT_DEBUG
+    std::cout << "Your INFERENCE_MSG_ID is: " << inference_msg_id_from_env
+              << std::endl;
+#endif
+  } else {
+#ifdef SAVE_WITH_OUTPUT_DEBUG
+    std::cout << "Failed to got INFERENCE_MSG_ID at env, use (int)1 as default."
+              << std::endl;
+#endif
+  }
+
+  msg_sed.mtext[0] = not_need_stop_data ? inference_msg_id_from_env
+                                        : -inference_msg_id_from_env;
+  int bsz = accept_tokens.shape()[0];
+  msg_sed.mtext[1] = bsz;
+
+  for (int i = 2; i < MAX_BSZ + 2; i++) {
+    if (i - 2 >= bsz || (skip_prefill && seq_lens_decoder_data[i - 2] <
+                                             prompt_lens_data[i - 2])) {
+      msg_sed.mtext[i] = 0;
+    } else {
+      msg_sed.mtext[i] = (int)accept_num_data[i - 2];
+    }
+    if (i - 2 < bsz && preempted_idx_data[i - 2] == 1) {
+      msg_sed.mtext[i] = -9;
+    }
+  }
+  for (int i = MAX_BSZ + 2; i < MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ + 2; i++) {
+    int token_id = i - MAX_BSZ - 2;
+    int bid = token_id / MAX_DRAFT_TOKENS;
+    int local_token_id = token_id % MAX_DRAFT_TOKENS;
+    if (token_id / MAX_DRAFT_TOKENS >= bsz) {
+      msg_sed.mtext[i] = 0;
+    } else {
+      msg_sed.mtext[i] =
+          accept_tokens_data[bid * max_draft_tokens + local_token_id];
+    }
+  }
+  if ((msgsnd(msgid,
+              &msg_sed,
+              (MAX_BSZ * MAX_DRAFT_TOKENS + MAX_BSZ + 2) * 4,
+              0)) == -1) {
+    printf("full msg buffer\n");
+  }
+  return;
 }

 void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
@@ -135,11 +139,20 @@ void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens,
                                      const paddle::Tensor& not_need_stop,
                                      const paddle::Tensor& seq_lens_decoder,
                                      const paddle::Tensor& prompt_lens,
+                                      const paddle::Tensor& preempted_idx,
                                      int64_t rank_id,
                                      bool save_each_rank,
                                      bool skip_prefill) {
-    SpeculateSaveWithOutputMsg(
-        accept_tokens, accept_num, not_need_stop, seq_lens_decoder, prompt_lens, rank_id, 1, save_each_rank, skip_prefill);
+  SpeculateSaveWithOutputMsg(accept_tokens,
+                             accept_num,
+                             not_need_stop,
+                             seq_lens_decoder,
+                             prompt_lens,
+                             preempted_idx,
+                             rank_id,
+                             1,
+                             save_each_rank,
+                             skip_prefill);
 }

 void SpeculateSaveWithOutputMsgDynamic(const paddle::Tensor& accept_tokens,
@@ -147,24 +160,46 @@ void SpeculateSaveWithOutputMsgDynamic(const paddle::Tensor& accept_tokens,
                                       const paddle::Tensor& not_need_stop,
                                       const paddle::Tensor& seq_lens_decoder,
                                       const paddle::Tensor& prompt_lens,
+                                       const paddle::Tensor& preempted_idx,
                                       int64_t rank_id,
                                       int msg_queue_id,
                                       bool save_each_rank,
                                       bool skip_prefill) {
-    SpeculateSaveWithOutputMsg(
-        accept_tokens, accept_num, not_need_stop, seq_lens_decoder, prompt_lens, rank_id, msg_queue_id, save_each_rank, skip_prefill);
+  SpeculateSaveWithOutputMsg(accept_tokens,
+                             accept_num,
+                             not_need_stop,
+                             seq_lens_decoder,
+                             prompt_lens,
+                             preempted_idx,
+                             rank_id,
+                             msg_queue_id,
+                             save_each_rank,
+                             skip_prefill);
 }

 PD_BUILD_STATIC_OP(speculate_save_output)
-    .Inputs({"accept_tokens", "accept_num", "not_need_stop", "seq_lens_decoder", "prompt_lens"})
+    .Inputs({"accept_tokens",
+             "accept_num",
+             "not_need_stop",
+             "seq_lens_decoder",
+             "prompt_lens",
+             "preempted_idx"})
    .Attrs({"rank_id: int64_t", "save_each_rank: bool", "skip_prefill: bool"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"accept_tokens", "x_out"}})
    .SetKernelFn(PD_KERNEL(SpeculateSaveWithOutputMsgStatic));

 PD_BUILD_STATIC_OP(speculate_save_output_dynamic)
-    .Inputs({"accept_tokens", "accept_num", "not_need_stop", "seq_lens_decoder", "prompt_lens"})
-    .Attrs({"rank_id: int64_t", "msg_queue_id: int", "save_each_rank: bool", "skip_prefill: bool"})
+    .Inputs({"accept_tokens",
+             "accept_num",
+             "not_need_stop",
+             "seq_lens_decoder",
+             "prompt_lens",
+             "preempted_idx"})
+    .Attrs({"rank_id: int64_t",
+            "msg_queue_id: int",
+            "save_each_rank: bool",
+            "skip_prefill: bool"})
    .Outputs({"x_out"})
    .SetInplaceMap({{"accept_tokens", "x_out"}})
    .SetKernelFn(PD_KERNEL(SpeculateSaveWithOutputMsgDynamic));
@@ -48,6 +48,7 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
                              const paddle::Tensor& not_need_stop,
                              const paddle::Tensor& seq_lens_decoder,
                              const paddle::Tensor& prompt_lens,
+                              const paddle::Tensor& preempted_idx,
                              int message_flag,  // Target: 3, Draft: 4
                              int64_t rank_id,
                              bool save_each_rank) {
@@ -79,6 +80,7 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
  int* cu_batch_token_offset_data = cu_batch_token_offset_cpu.data<int>();
  int* seq_lens_decoder_data = seq_lens_decoder_cpu.data<int>();
  int64_t* prompt_lens_data = prompt_lens_cpu.data<int64_t>();
+  const int32_t* preempted_idx_data = preempted_idx.data<int32_t>();

  static struct msgdata msg_sed;
  int msg_queue_id = 1;
@@ -142,6 +144,10 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
      cur_token_num = token_num_per_batch_data[i];
    }
    msg_sed.meta[3 + i] = cur_token_num;
+    if (preempted_idx_data[i] == 1) {
+      msg_sed.meta[3 + i] = -9;
+    }
+
    auto* cur_batch_msg_sed = &msg_sed.mtext[i];
    int token_offset = cu_batch_token_offset_data[i];
    for (int j = 0; j < cur_token_num; j++) {
@@ -196,16 +202,15 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
 }

 PD_BUILD_STATIC_OP(speculate_save_output_topk)
-    .Inputs({
-        "sampled_token_ids",
-        "logprob_token_ids",
-        "logprob_scores",
-        "logprob_ranks",
-        "token_num_per_batch",
-        "cu_batch_token_offset",
-        "not_need_stop",
-        "seq_lens_decoder",
-        "prompt_lens",
-    })
+    .Inputs({"sampled_token_ids",
+             "logprob_token_ids",
+             "logprob_scores",
+             "logprob_ranks",
+             "token_num_per_batch",
+             "cu_batch_token_offset",
+             "not_need_stop",
+             "seq_lens_decoder",
+             "prompt_lens",
+             "preempted_idx"})
    .Attrs({"message_flag: int", "rank_id: int64_t", "save_each_rank: bool"})
    .SetKernelFn(PD_KERNEL(SpeculateSaveOutMmsgTopK));