[Intel HPU] enable kv cache scheduler v1 for hpu (#5648)

* [Intel HPU] enable kv cache scheduler v1 for hpu

* fix Copilot review comments
Author: fmiao2372
Date:   2025-12-19 12:03:39 +08:00
Committed-by: GitHub
Parent: fc452c8e29
Commit: a8fce47195

6 changed files with 156 additions and 17 deletions
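The change is the same in each launch script touched here: `ENABLE_V1_KVCACHE_SCHEDULER=1` is prepended to the environment prefix of the `api_server` command. A minimal sketch of the resulting launch pattern, using only variables and flags that appear in this diff (other flags trimmed for brevity):

    # Enable the v1 KV cache scheduler for an HPU deployment; the remaining
    # environment variables are carried over unchanged from the existing scripts.
    ENABLE_V1_KVCACHE_SCHEDULER=1 \
    FD_ATTENTION_BACKEND=HPU_ATTN \
    FD_ENC_DEC_BLOCK_NUM=8 \
    python -m fastdeploy.entrypoints.openai.api_server \
        --model ERNIE-4.5-21B-A3B-Paddle \
        --port "${SERVER_PORT}"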
@@ -13,7 +13,7 @@ export CACHE_QUEUE_PORT=8003
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0
rm -rf log 2>/dev/null
-FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \
+ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \
python -m fastdeploy.entrypoints.openai.api_server \
--model ERNIE-4.5-21B-A3B-Paddle \
--port ${SERVER_PORT} \
@@ -32,7 +32,7 @@ FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WA
# (2k + 1k) / 128(block_size) * 128(batch) = 3072
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# rm -rf log 2>/dev/null
-# FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
+# ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
# python -m fastdeploy.entrypoints.openai.api_server \
# --model ERNIE-4.5-300B-A47B-Paddle \
# --port ${SERVER_PORT} \
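The sizing comment in the hunk above gives the warmup KV cache budget. Spelled out with the assumed values (2k = 2048 input tokens and 1k = 1024 output tokens per request, 128-token blocks): each request needs (2048 + 1024) / 128 = 24 blocks, and a batch of 128 requests therefore needs 24 * 128 = 3072 blocks. A one-line sanity check:

    # 3072 tokens per request over 128-token blocks, times a batch of 128
    echo $(( (2048 + 1024) / 128 * 128 ))   # prints 3072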
@@ -27,7 +27,7 @@ else
fi
rm -rf log 2>/dev/null
-FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \
+ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN \
python -m fastdeploy.entrypoints.openai.api_server --model ${MODEL} --port ${SERVER_PORT} \
--engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} --metrics-port ${METRICS_PORT} \
--cache-queue-port ${CACHE_QUEUE_PORT} --tensor-parallel-size ${CARD_NUM} --max-model-len 16384 \
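Once one of the scripts above is running, a quick probe confirms the server came up under the new environment. This assumes the server exposes the standard OpenAI-compatible `/v1/models` endpoint; the port comes from the script's own `SERVER_PORT`:

    # Poll until the server answers, then list the served model; a successful
    # response confirms the api_server started with the updated environment.
    until curl -sf "http://localhost:${SERVER_PORT}/v1/models" >/dev/null; do
        sleep 2
    done
    curl -s "http://localhost:${SERVER_PORT}/v1/models"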