mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[PD Disaggregation] Prefill and decode support cache storage (#6768)
* Prefill and decode support cache storage * up * up * update docs and refine mooncake store * up
This commit is contained in:
@@ -1,57 +1,31 @@
|
||||
# MooncakeStore for FastDeploy
|
||||
# Global Cache Pooling Examples
|
||||
|
||||
This document describes how to use MooncakeStore as the backend of FastDeploy.
|
||||
This directory contains example scripts for Global Cache Pooling with MooncakeStore.
|
||||
|
||||
## Preparation
|
||||
## Documentation
|
||||
|
||||
### Install FastDeploy
|
||||
- [English Documentation](../../docs/features/global_cache_pooling.md)
|
||||
- [中文文档](../../docs/zh/features/global_cache_pooling.md)
|
||||
|
||||
Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) for FastDeploy installation.
|
||||
|
||||
### Install MooncakeStore
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
pip install mooncake-transfer-engine
|
||||
```
|
||||
|
||||
## Run Examples
|
||||
|
||||
The example script is provided in `run.sh`. You can run it directly:
|
||||
```
|
||||
# Multi-instance scenario
|
||||
bash run.sh
|
||||
|
||||
# PD disaggregation scenario
|
||||
bash run_03b_pd_storage.sh
|
||||
```
|
||||
|
||||
In the example script, we start a Mooncake master server and two FastDeploy servers.
|
||||
## Scripts
|
||||
|
||||
Launch Mooncake master server:
|
||||
```bash
|
||||
mooncake_master \
|
||||
--port=15001 \
|
||||
--enable_http_metadata_server=true \
|
||||
--http_metadata_server_host=0.0.0.0 \
|
||||
--http_metadata_server_port=15002 \
|
||||
--metrics_port=15003 \
|
||||
```
|
||||
| Script | Scenario | Description |
|
||||
|--------|----------|-------------|
|
||||
| `run.sh` | Multi-Instance | Two standalone instances sharing cache |
|
||||
| `run_03b_pd_storage.sh` | PD Disaggregation | P+D instances with global cache pooling |
|
||||
|
||||
More parameters can be found in the [official guide](https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/python-api-reference/transfer-engine.md).
|
||||
## Files
|
||||
|
||||
Launch FastDeploy with Mooncake enabled.
|
||||
|
||||
```bash
|
||||
export MOONCAKE_CONFIG_PATH="./mooncake_config.json"
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_NAME} \
|
||||
--port ${PORT} \
|
||||
--metrics-port $((PORT + 1)) \
|
||||
--engine-worker-queue-port $((PORT + 2)) \
|
||||
--cache-queue-port $((PORT + 3)) \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--kvcache-storage-backend mooncake
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
For more details, please refer to:
|
||||
https://github.com/kvcache-ai/Mooncake/blob/main/docs/source/troubleshooting/troubleshooting.md
|
||||
- `mooncake_config.json` - Mooncake configuration file
|
||||
- `utils.sh` - Utility functions for scripts
|
||||
- `stop.sh` - Stop all running services
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
{
|
||||
"local_hostname":"localhost",
|
||||
"metadata_server":"http://0.0.0.0:15002/metadata",
|
||||
"global_segment_size":8589934592,
|
||||
"global_segment_size":1000000000,
|
||||
"local_buffer_size":134217728,
|
||||
"protocol":"rdma",
|
||||
"rdma_devices": "",
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
set -e
|
||||
|
||||
export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
|
||||
# export MODEL_NAME="/work/models/PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
|
||||
export MOONCAKE_CONFIG_PATH=./mooncake_config.json
|
||||
export FD_DEBUG=1
|
||||
|
||||
@@ -15,8 +16,8 @@ S0_PORT=52700
|
||||
S1_PORT=52800
|
||||
|
||||
ports=(
|
||||
$S0_PORT $((S0_PORT + 1)) $((S0_PORT + 2)) $((S0_PORT + 3))
|
||||
$S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
|
||||
$S0_PORT
|
||||
$S1_PORT
|
||||
)
|
||||
check_ports "${ports[@]}" || {
|
||||
echo "❌ Some ports are in use. Please release them."
|
||||
@@ -41,9 +42,6 @@ echo "server 0 port: ${S0_PORT}"
|
||||
nohup python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_NAME} \
|
||||
--port ${S0_PORT} \
|
||||
--metrics-port $((S0_PORT + 1)) \
|
||||
--engine-worker-queue-port $((S0_PORT + 2)) \
|
||||
--cache-queue-port $((S0_PORT + 3)) \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--kvcache-storage-backend mooncake \
|
||||
@@ -58,9 +56,6 @@ echo "server 1 port: ${S1_PORT}"
|
||||
nohup python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_NAME} \
|
||||
--port ${S1_PORT} \
|
||||
--metrics-port $((S1_PORT + 1)) \
|
||||
--engine-worker-queue-port $((S1_PORT + 2)) \
|
||||
--cache-queue-port $((S1_PORT + 3)) \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--kvcache-storage-backend mooncake \
|
||||
|
||||
@@ -0,0 +1,219 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# =============================================================================
|
||||
# PD Disaggregation + Global Cache Pooling Test Script
|
||||
# Reference: start_v1_tp1.sh (PD Disaggregation) + run.sh (Mooncake Cache Pooling)
|
||||
# Note: Modify CUDA_VISIBLE_DEVICES environment variables for PD instances
|
||||
# =============================================================================
|
||||
|
||||
# ======================== Environment Variables Configuration ========================
|
||||
export MODEL_NAME="/work/models/PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
|
||||
export FD_DEBUG=1
|
||||
|
||||
# Mooncake Configuration (using environment variables)
|
||||
master_ip="127.0.0.1"
|
||||
master_port=15001
|
||||
metadata_port=15002
|
||||
|
||||
export MOONCAKE_MASTER_SERVER_ADDR="${master_ip}:${master_port}"
|
||||
export MOONCAKE_METADATA_SERVER="http://${master_ip}:${metadata_port}/metadata"
|
||||
export MOONCAKE_GLOBAL_SEGMENT_SIZE="50000000000"
|
||||
# export MOONCAKE_PROTOCOL="tcp"
|
||||
export MOONCAKE_PROTOCOL="rdma"
|
||||
# export MOONCAKE_RDMA_DEVICES="mlx5_0"
|
||||
|
||||
# ======================== Port Configuration ========================
|
||||
P_PORT=52400
|
||||
D_PORT=52500
|
||||
ROUTER_PORT=52700
|
||||
LOG_DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
# ======================== Cleanup and Preparation ========================
|
||||
unset http_proxy && unset https_proxy
|
||||
rm -rf log_*
|
||||
|
||||
source ./utils.sh
|
||||
|
||||
# Check ports
|
||||
ports=($P_PORT $D_PORT $ROUTER_PORT $master_port $metadata_port)
|
||||
check_ports "${ports[@]}" || {
|
||||
echo "❌ Some ports are in use. Please release them."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# ======================== Start Mooncake Master ========================
|
||||
echo "=== Starting Mooncake Master ==="
|
||||
export FD_LOG_DIR="log_master"
|
||||
mkdir -p ${FD_LOG_DIR}
|
||||
|
||||
nohup mooncake_master \
|
||||
--port=${master_port} \
|
||||
--enable_http_metadata_server=true \
|
||||
--http_metadata_server_host=0.0.0.0 \
|
||||
--http_metadata_server_port=${metadata_port} \
|
||||
2>&1 > ${FD_LOG_DIR}/nohup &
|
||||
|
||||
sleep 2 # Wait for Mooncake Master to start
|
||||
|
||||
# ======================== Start Router ========================
|
||||
echo "=== Starting Router ==="
|
||||
export FD_LOG_DIR="log_router"
|
||||
mkdir -p ${FD_LOG_DIR}
|
||||
echo "Router log: ${FD_LOG_DIR}, port: ${ROUTER_PORT}"
|
||||
|
||||
nohup python -m fastdeploy.router.launch \
|
||||
--port ${ROUTER_PORT} \
|
||||
--splitwise \
|
||||
2>&1 > ${FD_LOG_DIR}/nohup &
|
||||
|
||||
sleep 2 # Wait for Router to start
|
||||
|
||||
# ======================== Start P Instance (Prefill) ========================
|
||||
echo "=== Starting Prefill Instance ==="
|
||||
export CUDA_VISIBLE_DEVICES=3
|
||||
export FD_LOG_DIR="log_prefill"
|
||||
mkdir -p ${FD_LOG_DIR}
|
||||
echo "Prefill log: ${FD_LOG_DIR}, port: ${P_PORT}, GPU: ${CUDA_VISIBLE_DEVICES}"
|
||||
|
||||
nohup python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_NAME} \
|
||||
--port ${P_PORT} \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--splitwise-role prefill \
|
||||
--cache-transfer-protocol rdma \
|
||||
--router "0.0.0.0:${ROUTER_PORT}" \
|
||||
--kvcache-storage-backend mooncake \
|
||||
2>&1 > ${FD_LOG_DIR}/nohup &
|
||||
|
||||
|
||||
# ======================== Start D Instance (Decode) ========================
|
||||
echo "=== Starting Decode Instance ==="
|
||||
export CUDA_VISIBLE_DEVICES=7
|
||||
export FD_LOG_DIR="log_decode"
|
||||
mkdir -p ${FD_LOG_DIR}
|
||||
echo "Decode log: ${FD_LOG_DIR}, port: ${D_PORT}, GPU: ${CUDA_VISIBLE_DEVICES}"
|
||||
|
||||
nohup python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_NAME} \
|
||||
--port ${D_PORT} \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 32 \
|
||||
--splitwise-role decode \
|
||||
--cache-transfer-protocol rdma \
|
||||
--router "0.0.0.0:${ROUTER_PORT}" \
|
||||
--enable-output-caching \
|
||||
--kvcache-storage-backend mooncake \
|
||||
2>&1 > ${FD_LOG_DIR}/nohup &
|
||||
|
||||
|
||||
# ======================== Wait for Services to be Ready ========================
|
||||
echo "=== Waiting for services to be ready ==="
|
||||
wait_for_health ${P_PORT}
|
||||
wait_for_health ${D_PORT}
|
||||
|
||||
# Wait for services to register to Router
|
||||
sleep 10
|
||||
echo "✅ All services are ready!"
|
||||
|
||||
# ======================== Send Test Requests ========================
|
||||
# Test scenario: Multi-turn conversation, verify that output cache written by D instance can be read by P instance
|
||||
#
|
||||
# Flow:
|
||||
# 1. Request 1: Send first round question, D instance generates answer and writes to global cache (prompt + output)
|
||||
# 2. Request 2: Send second round conversation (first round Q&A + follow-up), P instance should hit global cache for first round's complete KV cache
|
||||
#
|
||||
echo ""
|
||||
echo "=== Multi-turn Conversation Test for Global Cache Pooling ==="
|
||||
|
||||
# First round question
|
||||
msg1="深圳是中国经济实力最强的城市之一。近年来,深圳 GDP 持续稳步增长,2023 年突破 3.4 万亿元人民币,2024 年接近 3.7 万亿元,长期位居全国城市前列。深圳经济以第二产业和第三产业为主,高端制造业、电子信息产业和现代服务业发达,形成了以科技创新为核心的产业结构。依托华为、腾讯、大疆等龙头企业,深圳在数字经济、人工智能、新能源等领域具有显著优势。同时,深圳进出口总额常年位居全国城市第一,是中国对外开放和高质量发展的重要引擎。深圳2024年 GDP 是多少?"
|
||||
|
||||
echo ""
|
||||
echo ">>> Request 1: First round question"
|
||||
echo " Purpose: D instance generates output and writes to global cache (prompt + output)"
|
||||
echo ""
|
||||
|
||||
# Send first round request and get response
|
||||
response1=$(curl -s -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"messages\": [
|
||||
{\"role\": \"user\", \"content\": \"${msg1}\"}
|
||||
],
|
||||
\"max_tokens\": 200,
|
||||
\"min_tokens\": 130,
|
||||
\"stream\": false,
|
||||
\"top_p\": 0
|
||||
}")
|
||||
|
||||
echo "Response 1:"
|
||||
echo "${response1}" | python3 -m json.tool 2>/dev/null || echo "${response1}"
|
||||
|
||||
# Extract first round response content
|
||||
assistant_reply=$(echo "${response1}" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['choices'][0]['message']['content'])" 2>/dev/null || echo "")
|
||||
|
||||
if [ -z "${assistant_reply}" ]; then
|
||||
echo "❌ Failed to get response from Request 1"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# JSON escape assistant_reply to prevent newlines, quotes, and other special characters from breaking JSON format
|
||||
assistant_reply_escaped=$(python3 -c "import json,sys; print(json.dumps(sys.stdin.read().strip()))" <<< "${assistant_reply}")
|
||||
|
||||
echo ""
|
||||
echo "Assistant reply extracted: ${assistant_reply}..."
|
||||
|
||||
# Wait for D instance to write output cache to global storage
|
||||
echo ""
|
||||
echo ">>> Waiting for D instance to write output cache to global storage..."
|
||||
sleep 5
|
||||
|
||||
# Second round follow-up question
|
||||
msg2="那深圳2023年的GDP是多少?和2024年相比增长了多少?"
|
||||
|
||||
echo ""
|
||||
echo ">>> Request 2: Second round (multi-turn conversation)"
|
||||
echo " Purpose: P instance should hit global cache including D's output from Request 1"
|
||||
echo " Check log_prefill/nohup for 'storage_match' to verify cache hit"
|
||||
echo ""
|
||||
|
||||
# Send second round request (including complete multi-turn conversation history)
|
||||
response2=$(curl -s -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{
|
||||
\"messages\": [
|
||||
{\"role\": \"user\", \"content\": \"${msg1}\"},
|
||||
{\"role\": \"assistant\", \"content\": ${assistant_reply_escaped}},
|
||||
{\"role\": \"user\", \"content\": \"${msg2}\"}
|
||||
],
|
||||
\"max_tokens\": 100,
|
||||
\"stream\": false,
|
||||
\"top_p\": 0
|
||||
}")
|
||||
|
||||
echo "Response 2:"
|
||||
echo "${response2}" | python3 -m json.tool 2>/dev/null || echo "${response2}"
|
||||
|
||||
# Extract second round response content and display
|
||||
assistant_reply2=$(echo "${response2}" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data['choices'][0]['message']['content'])" 2>/dev/null || echo "")
|
||||
echo ""
|
||||
echo "Assistant reply 2: ${assistant_reply2}"
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
echo "=== Test completed ==="
|
||||
echo ""
|
||||
echo "Verification Steps:"
|
||||
echo "1. Check log_prefill/nohup for Request 2's cache hit info:"
|
||||
echo " grep -E 'storage_match|cache_hit|matched.*block' log_prefill/nohup"
|
||||
echo ""
|
||||
echo "2. If 'storage_match_token_num > 0' in Request 2, it means P instance"
|
||||
echo " successfully read the output cache written by D instance from Request 1"
|
||||
echo ""
|
||||
echo "Log files:"
|
||||
echo " - Prefill: log_prefill/nohup"
|
||||
echo " - Decode: log_decode/nohup"
|
||||
echo " - Router: log_router/nohup"
|
||||
echo " - Master: log_master/nohup"
|
||||
Reference in New Issue
Block a user