[PD Disaggregation] support different tp_size for prefill and decode (#5296)

* up * up * up * fix
2026-04-24 01:29:57 +08:00 · 2025-12-01 17:50:20 +08:00
parent 54119cf07e
commit 0925d44f18
13 changed files with 584 additions and 36 deletions
@@ -142,6 +142,7 @@ struct Connection {
  int wc_target_count;

  // Configuration
+  int decode_tp_size;
  int layer_number;
  int block_number;
  int block_byte_size;
@@ -24,11 +24,15 @@ class RDMACommunicator {
                   std::vector<int64_t> local_key_cache,
                   std::vector<int64_t> local_value_cache,
                   int block_number,
-                   int block_bytes);
+                   int block_bytes,
+                   int prefill_tp_size,
+                   int prefill_tp_idx);
  ~RDMACommunicator();

  // Connection management
-  int connect(const std::string& dst_ip, const std::string& dst_port);
+  int connect(const std::string& dst_ip,
+              const std::string& dst_port,
+              int dest_tp_size);
  bool is_connected(const std::string& dst_ip, const std::string& dst_port);

  // Core functionality
@@ -120,6 +124,8 @@ class RDMACommunicator {
  int block_number;                       // Number of blocks
  int block_size_byte;                    // Size of each block in bytes
  int layer_number;                       // Number of layers
+  int prefill_tp_size;                    // Tensor parallelism size for prefill
+  int prefill_tp_idx;  // Tensor parallelism index for prefill

  std::vector<std::vector<void*>>
      local_cache_key_ptr_per_layer;  // Per-layer key pointers