mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Intel HPU] enable kv cache scheduler v1 for hpu (#5648)
* [Intel HPU] enable kv cache scheduler v1 for hpu * fix copilt comments
This commit is contained in:
@@ -13,7 +13,7 @@ export CACHE_QUEUE_PORT=8003
|
||||
export HABANA_PROFILE=0
|
||||
export HPU_VISIBLE_DEVICES=0
|
||||
rm -rf log 2>/dev/null
|
||||
FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \
|
||||
ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN \
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ERNIE-4.5-21B-A3B-Paddle \
|
||||
--port ${SERVER_PORT} \
|
||||
@@ -32,7 +32,7 @@ FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WA
|
||||
# (2k + 1k) / 128(block_size) * 128(batch) = 3072
|
||||
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
# rm -rf log 2>/dev/null
|
||||
# FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
|
||||
# ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
|
||||
# python -m fastdeploy.entrypoints.openai.api_server \
|
||||
# --model ERNIE-4.5-300B-A47B-Paddle \
|
||||
# --port ${SERVER_PORT} \
|
||||
|
||||
@@ -27,7 +27,7 @@ else
|
||||
fi
|
||||
|
||||
rm -rf log 2>/dev/null
|
||||
FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \
|
||||
ENABLE_V1_KVCACHE_SCHEDULER=1 FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \
|
||||
python -m fastdeploy.entrypoints.openai.api_server --model ${MODEL} --port ${SERVER_PORT} \
|
||||
--engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} --metrics-port ${METRICS_PORT} \
|
||||
--cache-queue-port ${CACHE_QUEUE_PORT} --tensor-parallel-size ${CARD_NUM} --max-model-len 16384 \
|
||||
|
||||
Reference in New Issue
Block a user