[Feature][BugFix][OP] Enhance Deterministic Inference Mode with Kernel-level Fixes and Batch-invariant BMM (#6610)
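The "batch-invariant BMM" in the title refers to the fact that floating-point addition is not associative, so a matmul kernel whose reduction order changes with batch size or tile configuration can return different bits for the same input row. The toy Python sketch below (not the PR's actual kernel code; `row_sum` and `right_to_left` are hypothetical names for illustration) reduces one row in two different orders and gets two different results:

```python
def row_sum(vals, right_to_left=False):
    """Toy reduction standing in for one row of a batched matmul.

    right_to_left=True mimics a kernel that reassociates its
    reduction tree (e.g. because a different tile size was picked
    for a different batch size); the default is a fixed
    left-to-right order, which is batch-invariant.
    """
    total = 0.0
    order = reversed(vals) if right_to_left else vals
    for v in order:
        total += v
    return total

row = [0.1, 0.2, 0.3]
print(row_sum(row))                      # 0.6000000000000001
print(row_sum(row, right_to_left=True))  # 0.6
```

Same numbers, different summation order, different result — which is why deterministic mode needs kernels whose reduction order is fixed regardless of batch size.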

* add fa deter

* add ut

* add long sentence

* fix basic

* fix bugs

* fix adn

* fix first

* fix single

* fix single

* fix single test

* refine

* add more test

* refine comments

* add comments of bmm

* fix ci

* remove probe

* add

* remove not need

* refine tests

* fix comments and refine code

* refine code

* refine test

* refine test

* mv 4cards tests

* fix tests

* add

* fix comments

* fix cover

* fix cover

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
Author: gongweibao
Date: 2026-03-09 10:27:53 +08:00
Committed by: GitHub
Parent: 3a85ecf3bc
Commit: 30f9f33f34
23 changed files with 3563 additions and 153 deletions
@@ -28,8 +28,8 @@ namespace cub = hipcub;
 #include <sys/types.h>
 #include <unistd.h>
 #include <algorithm>
-#include "stdint.h"
 #include "helper.h"
+#include "stdint.h"
 #define FLT_MAX 1e38
@@ -1372,6 +1372,9 @@ std::vector<paddle::Tensor> BeamSearchSoftmax(
                       cudaMemcpyDeviceToHost,
                       cu_stream);
+  // Must synchronize before using host values copied from device
+  cudaStreamSynchronize(cu_stream);
+
   int beam_batch_size = logits_shape[0];
   int batch_size = beam_batch_size / beam_width_scalar;
   int vocab_size = logits_shape[1];
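The hunk above fixes a classic host/device race: `cudaMemcpyAsync` returns before the copy completes, so reading `logits_shape` on the host without first calling `cudaStreamSynchronize` can observe stale data. A minimal Python analogy of the bug class (threads standing in for the CUDA stream; names like `memcpy_async` are illustrative, not real APIs):

```python
import threading

device_result = [123]  # stands in for a value in device memory
host_shape = [0]       # host-side destination of the async copy

def memcpy_async():
    # Analogue of cudaMemcpyAsync: runs concurrently with the host thread.
    host_shape[0] = device_result[0]

worker = threading.Thread(target=memcpy_async)
worker.start()
# Reading host_shape[0] at this point would be the original bug:
# the copy may not have completed, so stale data could be observed.
worker.join()          # analogue of cudaStreamSynchronize(cu_stream)
print(host_shape[0])   # 123 -- safe only after synchronization
```

The fix in the diff follows the same shape: synchronize the stream between the async copy and the first host-side read of the copied values.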