[Cherry-Pick][CI] Sync dev optimizations to 2.4(#7335) (#7346)

* [Cherry-Pick][CI] Sync dev optimizations to 2.4(#7335)
This commit is contained in:
YuBaoku
2026-04-12 20:21:17 +08:00
committed by GitHub
parent cdc5fce1b6
commit 19b0038234
17 changed files with 282 additions and 2126 deletions
+31 -5
View File
@@ -69,12 +69,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -145,7 +160,10 @@ jobs:
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
@@ -160,10 +178,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -206,3 +225,10 @@ jobs:
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+27 -5
View File
@@ -81,7 +81,14 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
@@ -111,7 +118,11 @@ jobs:
exit 1
fi
tar -xf FastDeploy.tar.gz
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -182,7 +193,10 @@ jobs:
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
@@ -197,17 +211,18 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install ${fastdeploy_wheel_url}
python -m pip install pytest
wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64
chmod +x ./llm-deploy-linux-amd64
./llm-deploy-linux-amd64 -python python3.10 \
-model_name ERNIE-4.5-0.3B-Paddle \
@@ -279,3 +294,10 @@ jobs:
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+12 -2
View File
@@ -120,6 +120,7 @@ jobs:
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: FastDeploy Build
shell: bash
env:
@@ -150,7 +151,8 @@ jobs:
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
--cap-add=SYS_PTRACE --shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -164,6 +166,7 @@ jobs:
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
@@ -188,7 +191,7 @@ jobs:
else
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
fi
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -237,3 +240,10 @@ jobs:
target_path_stripped="${target_path#paddle-github-action/}"
WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+15 -3
View File
@@ -8,7 +8,7 @@ on:
description: "Build Images"
required: true
type: string
default: "iregistry.baidu-int.com/tiangexiao/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-rc2"
default: "iregistry.baidu-int.com/new_rl_infra/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-v2.4.0-rc1"
FASTDEPLOY_ARCHIVE_URL:
description: "URL of the compressed FastDeploy code archive."
required: true
@@ -52,9 +52,10 @@ on:
wheel_path_rl:
description: "Output path of the generated wheel"
value: ${{ jobs.fd-build-rl.outputs.wheel_path_rl }}
jobs:
fd-build-rl:
runs-on: [self-hosted, GPU-Build]
runs-on: [self-hosted, GPU-Build-RL]
timeout-minutes: 360
outputs:
wheel_path_rl: ${{ steps.set_output.outputs.wheel_path_rl }}
@@ -107,6 +108,7 @@ jobs:
git config --global user.name "FastDeployCI"
git config --global user.email "fastdeploy_ci@example.com"
git log -n 3 --oneline
- name: FastDeploy Build
shell: bash
env:
@@ -137,7 +139,8 @@ jobs:
PARENT_DIR=$(dirname "$WORKSPACE")
echo "PARENT_DIR:$PARENT_DIR"
docker run --rm --net=host \
--cap-add=SYS_PTRACE --privileged --shm-size=64G \
--cap-add=SYS_PTRACE --shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache_rl:/root/.cache" \
@@ -151,6 +154,7 @@ jobs:
-e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
-e "BRANCH_REF=${BRANCH_REF}" \
-e "CCACHE_MAXSIZE=50G" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c '
if [[ -n "${FD_VERSION}" ]]; then
export FASTDEPLOY_VERSION=${FD_VERSION}
@@ -162,6 +166,7 @@ jobs:
cd FastDeploy
# Avoid using pip cache to ensure the wheel is updated to the latest version
python -m pip uninstall paddlepaddle-gpu -y || true
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Paddle-RL-Compile/release/3.3/latest/paddlepaddle_gpu-3.3.0.dev-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu*
@@ -202,3 +207,10 @@ jobs:
target_path_stripped="${target_path#paddle-github-action/}"
WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name}
echo "wheel_path_rl=${WHEEL_PATH}" >> $GITHUB_OUTPUT
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+31 -6
View File
@@ -81,12 +81,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -166,7 +181,10 @@ jobs:
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
@@ -183,6 +201,7 @@ jobs:
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
-e "IS_PR=${IS_PR}" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
@@ -191,8 +210,7 @@ jobs:
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install -r scripts/unittest_requirement.txt
@@ -204,3 +222,10 @@ jobs:
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash scripts/run_gpu_4cards.sh
'
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+33 -5
View File
@@ -78,11 +78,27 @@ jobs:
if ls /workspace/* >/dev/null 2>&1; then
echo "ERROR: Failed to clean /workspace/* after multiple attempts"
ls -ld /workspace/*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls /workspace/* >/dev/null 2>&1; then
echo "ERROR: Force cleanup failed. Exiting..."
exit 1
else
echo "Force cleanup succeeded."
fi
fi
'
wget -q --no-proxy ${paddletest_archive_url}
tar -xf PaddleTest.tar.gz
wget -q --no-proxy ${paddletest_archive_url} || {
echo "ERROR: Failed to download archive from ${paddletest_archive_url}"
exit 1
}
tar --no-same-owner -xf PaddleTest.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf PaddleTest.tar.gz
cd PaddleTest
git config --global user.name "FastDeployCI"
@@ -152,7 +168,11 @@ jobs:
echo "Removing stale container: ${runner_name}"
docker rm -f ${runner_name} || true
fi
docker run --rm --ipc=host --pid=host --net=host \
docker run --rm --net=host \
--shm-size=64g \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
@@ -167,10 +187,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -221,3 +242,10 @@ jobs:
run: |
echo "logprob test failed with exit code ${{ env.LOGPROB_EXIT_CODE }}"
exit 8
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+28 -6
View File
@@ -83,12 +83,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -163,6 +178,7 @@ jobs:
fi
docker run --rm --net=host \
--shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
@@ -181,14 +197,20 @@ jobs:
-e "FD_ZMQ_SEND_RESPONSE_SERVER_PORT=${FD_ZMQ_SEND_RESPONSE_SERVER_PORT}" \
-e "FD_ZMQ_CONTROL_CMD_SERVER_PORTS=${FD_ZMQ_CONTROL_CMD_SERVER_PORTS}" \
-e "fd_wheel_url=${fd_wheel_url}" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
python -m pip install ${fd_wheel_url}
bash scripts/run_pre_ce.sh
'
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+30 -4
View File
@@ -81,12 +81,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -160,6 +175,7 @@ jobs:
fi
docker run --rm --net=host \
--shm-size=64G \
--name ${runner_name} \
-v $(pwd):/workspace \
-w /workspace \
@@ -175,10 +191,11 @@ jobs:
-v "${CACHE_DIR}/.cache:/root/.cache" \
-v "${CACHE_DIR}/ConfigDir:/root/.config" \
-e TZ="Asia/Shanghai" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc '
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -190,6 +207,7 @@ jobs:
TEST_EXIT_CODE=0
pushd tests/ce/stable_cases
bash launch_model.sh /MODELDATA
TEST_EXIT_CODE=0
bash run.sh || {
TEST_EXIT_CODE=1
@@ -211,6 +229,7 @@ jobs:
echo "======================================================="
}
popd
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env
'
@@ -220,3 +239,10 @@ jobs:
fi
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
+32 -5
View File
@@ -85,12 +85,27 @@ jobs:
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
ls -ld "${REPO_NAME}"*
exit 1
echo "Attempting force cleanup with find..."
find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true
if ls "${REPO_NAME}"* >/dev/null 2>&1; then
echo "ERROR: Force cleanup still failed"
exit 1
else
echo "Force cleanup succeeded"
fi
fi
'
wget -q --no-proxy ${fd_archive_url}
tar -xf FastDeploy.tar.gz
wget -q --no-proxy ${fd_archive_url} || {
echo "ERROR: Failed to download archive from ${fd_archive_url}"
exit 1
}
tar --no-same-owner -xf FastDeploy.tar.gz || {
echo "ERROR: Failed to extract archive"
exit 1
}
rm -rf FastDeploy.tar.gz
cd FastDeploy
git config --global user.name "FastDeployCI"
@@ -173,12 +188,16 @@ jobs:
export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')
docker run --rm --net=host \
--sysctl kernel.msgmax=1048576 \
--sysctl kernel.msgmnb=268435456 \
--name ${runner_name} \
--cap-add=SYS_PTRACE --cap-add=IPC_LOCK \
--shm-size=64G \
--shm-size=128G \
${RDMA_DEVICES} \
--device=/dev/infiniband/rdma_cm \
--ulimit memlock=-1:-1 \
--ulimit nofile=65536:65536 \
--ulimit nproc=8192:8192 \
-v $(pwd):/workspace -w /workspace \
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
-v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -198,6 +217,7 @@ jobs:
-e "fd_wheel_url=${fd_wheel_url}" \
-e "BASE_REF=${BASE_REF}" \
-e "IS_PR=${IS_PR}" \
-e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \
--gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c '
git config --global --add safe.directory /workspace/FastDeploy
@@ -205,7 +225,7 @@ jobs:
git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt
# Avoid using pip cache to ensure the wheel is updated to the latest version
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl
python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/
pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
python -m pip install -r scripts/unittest_requirement.txt
@@ -380,6 +400,13 @@ jobs:
echo "coverage passed"
exit 0
- name: Terminate and delete the container
if: always()
run: |
set +e
docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete'
docker rm -f ${{ runner.name }}
diff_coverage_report:
needs: run_tests_with_coverage
if: always()
@@ -0,0 +1,19 @@
name: PR Build and Test
on:
pull_request:
types: [closed]
branches: [develop, release/**]
permissions: read-all
concurrency:
group: ${{ github.event.pull_request.number }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
cancel:
name: Cancel PR Build and Test for ${{ github.event.pull_request.number }}
runs-on: ubuntu-latest
steps:
- name: Cancel PR Build and Test
run: |
exit 0
-1
View File
@@ -4,7 +4,6 @@ on:
pull_request:
branches:
- develop
- 'release/*'
workflow_dispatch:
concurrency:
+5 -1
View File
@@ -7,7 +7,11 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p
python -m pip install -r requirements.txt
python -m pip install jsonschema aistudio_sdk==0.3.5
python -m pip install xgrammar==0.1.19 torch==2.6.0
# Use prebuilt wheel files to install xgrammar==0.1.19 and torch==2.6.0 specifically for the CI environment
python -m pip install \
https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl \
https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl \
https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
failed_files=()
run_path="$DIR/../tests/ci_use/"
+2 -2
View File
@@ -26,14 +26,14 @@ def build_request_payload(template_name: str, case_data: dict) -> dict:
return final_payload
def send_request(url, payload, timeout=600, stream=False):
def send_request(url, payload, timeout=60, stream=False):
"""
向指定URL发送POST请求,并返回响应结果。
Args:
url (str): 请求的目标URL。
payload (dict): 请求的负载数据,应该是一个字典类型。
timeout (int, optional): 请求的超时时间,默认为600秒。
timeout (int, optional): 请求的超时时间,默认为60秒。
stream (bool, optional): 是否以流的方式下载响应内容,默认为False。
Returns:
File diff suppressed because it is too large Load Diff
@@ -22,7 +22,6 @@ import time
import openai
import pytest
import requests
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)
@@ -129,97 +128,6 @@ def setup_and_run_server():
print(f"Failed to terminate API server: {e}")
@pytest.fixture(scope="session")
def api_url(request):
"""
Returns the API endpoint URL for chat completions.
"""
return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions"
@pytest.fixture(scope="session")
def metrics_url(request):
"""
Returns the metrics endpoint URL.
"""
return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics"
@pytest.fixture
def headers():
"""
Returns common HTTP request headers.
"""
return {"Content-Type": "application/json"}
@pytest.fixture
def consistent_payload():
"""
Returns a fixed payload for consistency testing,
including a fixed random seed and temperature.
"""
return {
"messages": [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
"detail": "high",
},
},
{"type": "text", "text": "请描述图片内容"},
],
}
],
"temperature": 0.8,
"top_p": 0, # fix top_p to reduce randomness
"seed": 13, # fixed random seed
}
# ==========================
# Consistency test for repeated runs with fixed payload
# ==========================
def test_consistency_between_runs(api_url, headers, consistent_payload):
"""
Test that result is same as the base result.
"""
# request
resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
assert resp1.status_code == 200
result1 = resp1.json()
content1 = (
result1["choices"][0]["message"]["reasoning_content"]
+ "</think>"
+ result1["choices"][0]["message"]["content"]
)
file_res_temp = "ernie-4_5-vl"
f_o = open(file_res_temp, "a")
f_o.writelines(content1)
f_o.close()
# base result
base_path = os.getenv("MODEL_PATH")
if base_path:
base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24-0130")
else:
base_file = "ernie-4_5-vl-base-tp2-24-0130"
with open(base_file, "r") as f:
content2 = f.read()
# Verify that result is same as the base result
assert content1 == content2
# ==========================
# OpenAI Client Chat Completion Test
# ==========================
@pytest.fixture
def openai_client():
ip = "0.0.0.0"
@@ -231,305 +139,9 @@ def openai_client():
return client
# Non-streaming test
def test_non_streaming_chat(openai_client):
"""Test non-streaming chat functionality with the local service"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{
"role": "system",
"content": "You are a helpful AI assistant.",
}, # system不是必需,可选
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
"detail": "high",
},
},
{"type": "text", "text": "请描述图片内容"},
],
},
],
temperature=1,
max_tokens=53,
stream=False,
)
assert hasattr(response, "choices")
assert len(response.choices) > 0
assert hasattr(response.choices[0], "message")
assert hasattr(response.choices[0].message, "content")
# Streaming test
def test_streaming_chat(openai_client, capsys):
"""Test streaming chat functionality with the local service"""
response = openai_client.chat.completions.create(
model="default",
messages=[
{
"role": "system",
"content": "You are a helpful AI assistant.",
}, # system不是必需,可选
{"role": "user", "content": "List 3 countries and their capitals."},
{
"role": "assistant",
"content": "China(Beijing), France(Paris), Australia(Canberra).",
},
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0",
"detail": "high",
},
},
{"type": "text", "text": "请描述图片内容"},
],
},
],
temperature=1,
max_tokens=512,
stream=True,
)
output = []
for chunk in response:
if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"):
output.append(chunk.choices[0].delta.content)
assert len(output) > 2
# ==========================
# OpenAI Client additional chat/completions test
# Helper functions for structured outputs testing
# ==========================
def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
"""
Test return_token_ids option in non-streaming chat functionality with the local service
"""
# 设定 return_token_ids
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
"detail": "high",
},
},
{"type": "text", "text": "请描述图片内容"},
],
},
],
temperature=1,
max_tokens=53,
extra_body={"return_token_ids": True},
stream=False,
)
assert hasattr(response, "choices")
assert len(response.choices) > 0
assert hasattr(response.choices[0], "message")
assert hasattr(response.choices[0].message, "prompt_token_ids")
assert isinstance(response.choices[0].message.prompt_token_ids, list)
assert hasattr(response.choices[0].message, "completion_token_ids")
assert isinstance(response.choices[0].message.completion_token_ids, list)
# 不设定 return_token_ids
response = openai_client.chat.completions.create(
model="default",
messages=[
{"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
"detail": "high",
},
},
{"type": "text", "text": "请描述图片内容"},
],
},
],
temperature=1,
max_tokens=53,
extra_body={"return_token_ids": False},
stream=False,
)
assert hasattr(response, "choices")
assert len(response.choices) > 0
assert hasattr(response.choices[0], "message")
assert hasattr(response.choices[0].message, "prompt_token_ids")
assert response.choices[0].message.prompt_token_ids is None
assert hasattr(response.choices[0].message, "completion_token_ids")
assert response.choices[0].message.completion_token_ids is None
def test_streaming_chat_with_return_token_ids(openai_client, capsys):
    """
    Check the return_token_ids switch on a streaming chat request.

    With the option enabled, the first chunk must carry prompt_token_ids as a
    list and no completion_token_ids, and every later chunk the reverse.
    With the option disabled, both fields must be None on every chunk.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},  # the system message is optional
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
                        "detail": "high",
                    },
                },
                {"type": "text", "text": "请描述图片内容"},
            ],
        },
    ]

    def open_stream(return_token_ids):
        """Start a streaming request for the shared prompt."""
        return openai_client.chat.completions.create(
            model="default",
            messages=conversation,
            temperature=1,
            max_tokens=53,
            extra_body={"return_token_ids": return_token_ids},
            stream=True,
        )

    # enable return_token_ids
    for position, chunk in enumerate(open_stream(True)):
        assert hasattr(chunk, "choices")
        assert len(chunk.choices) > 0
        assert hasattr(chunk.choices[0], "delta")
        delta = chunk.choices[0].delta
        assert hasattr(delta, "prompt_token_ids")
        assert hasattr(delta, "completion_token_ids")
        if position == 0:
            assert isinstance(delta.prompt_token_ids, list)
            assert delta.completion_token_ids is None
        else:
            assert delta.prompt_token_ids is None
            assert isinstance(delta.completion_token_ids, list)

    # disable return_token_ids
    for chunk in open_stream(False):
        assert hasattr(chunk, "choices")
        assert len(chunk.choices) > 0
        assert hasattr(chunk.choices[0], "delta")
        delta = chunk.choices[0].delta
        assert hasattr(delta, "prompt_token_ids")
        assert delta.prompt_token_ids is None
        assert hasattr(delta, "completion_token_ids")
        assert delta.completion_token_ids is None
def test_chat_with_thinking(openai_client, capsys):
    """
    Exercise enable_thinking and reasoning_max_tokens on chat requests.

    Four requests are issued in order: thinking enabled (non-streaming),
    thinking disabled (non-streaming), thinking enabled with
    reasoning_max_tokens=None (non-streaming), and thinking enabled with
    reasoning_max_tokens=3 (streaming, with token ids returned).
    """
    question = [{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}]

    # enable thinking, non-streaming
    reply = openai_client.chat.completions.create(
        model="default",
        messages=question,
        temperature=1,
        stream=False,
        max_tokens=10,
        extra_body={"chat_template_kwargs": {"enable_thinking": True}},
    )
    assert reply.choices[0].message.reasoning_content is not None

    # disable thinking, non-streaming
    reply = openai_client.chat.completions.create(
        model="default",
        messages=question,
        temperature=1,
        stream=False,
        max_tokens=10,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    assert reply.choices[0].message.reasoning_content is None
    assert "</think>" not in reply.choices[0].message.content

    # thinking enabled with an explicit reasoning_max_tokens of None
    reasoning_max_tokens = None
    reply = openai_client.chat.completions.create(
        model="default",
        messages=question,
        temperature=1,
        stream=False,
        max_tokens=20,
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
            "reasoning_max_tokens": reasoning_max_tokens,
        },
    )
    assert reply.choices[0].message.reasoning_content is not None

    # enable thinking, streaming
    reasoning_max_tokens = 3
    stream = openai_client.chat.completions.create(
        model="default",
        messages=question,
        temperature=1,
        extra_body={
            "chat_template_kwargs": {"enable_thinking": True},
            "reasoning_max_tokens": reasoning_max_tokens,
            "return_token_ids": True,
        },
        stream=True,
        max_tokens=10,
    )
    completion_tokens = 0
    reasoning_tokens = 0
    total_tokens = 0
    chunks = iter(stream)
    next(chunks, None)  # the first chunk is an extra chunk and is not counted
    for chunk in chunks:
        delta_message = chunk.choices[0].delta
        token_count = len(delta_message.completion_token_ids)
        # A chunk with reasoning text and empty content counts as reasoning.
        if delta_message.reasoning_content != "" and delta_message.content == "":
            reasoning_tokens += token_count
        else:
            completion_tokens += token_count
        total_tokens += token_count
    assert completion_tokens + reasoning_tokens == total_tokens
    assert reasoning_tokens <= reasoning_max_tokens
def streaming_chat_base(openai_client, chat_param):
"""
Test streaming chat base functionality with the local service
@@ -571,6 +183,9 @@ def non_streaming_chat_base(openai_client, chat_param):
return response.choices[0].message.content
# ==========================
# Structured outputs tests
# ==========================
@pytest.mark.skip(reason="Temporarily skip this case due to unstable execution")
def test_structured_outputs_json_schema(openai_client):
"""
@@ -776,6 +391,7 @@ def test_structured_outputs_choice(openai_client):
"""
choice_param = {
"temperature": 1,
"top_p": 0.0,
"max_tokens": 1024,
"messages": [{"role": "user", "content": "What is the landmark building in Shenzhen?"}],
"extra_body": {
@@ -815,8 +431,6 @@ def test_structured_outputs_regex(openai_client):
"extra_body": {"guided_regex": r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n"},
}
import re
response = streaming_chat_base(openai_client, regex_param)
assert re.fullmatch(
r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response
@@ -855,6 +469,7 @@ def test_structured_outputs_grammar(openai_client):
grammar_param = {
"temperature": 1,
"top_p": 0.0,
"max_tokens": 1024,
"messages": [
{
@@ -865,96 +480,8 @@ def test_structured_outputs_grammar(openai_client):
"extra_body": {"guided_grammar": html_h1_grammar},
}
import re
pattern = r'^<h1( style="[^"]*")?>[A-Za-z0-9 ]+</h1>$'
response = streaming_chat_base(openai_client, grammar_param)
assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected"
response = non_streaming_chat_base(openai_client, grammar_param)
assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected"
def test_profile_reset_block_num():
    """
    Check that the profiled total_block_num stays within 5% of the baseline.

    Reads ./log/config.log, takes the first line containing
    "Reset block num", extracts the integer following "total_block_num:" and
    compares it with the hard-coded baseline. The test fails if the log file,
    the line, or the number is missing.
    """
    log_file = "./log/config.log"
    baseline = 40000
    tolerance = 0.05

    if not os.path.exists(log_file):
        pytest.fail(f"Log file not found: {log_file}")

    # Stream the log and stop at the first match instead of loading it whole.
    with open(log_file, "r") as f:
        target_line = next((line.strip() for line in f if "Reset block num" in line), None)

    if target_line is None:
        pytest.fail("日志中没有Reset block num信息")

    match = re.search(r"total_block_num:(\d+)", target_line)
    if not match:
        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")

    # The (\d+) group only matches digits, so int() cannot raise here.
    actual_value = int(match.group(1))

    lower_bound = baseline * (1 - tolerance)
    upper_bound = baseline * (1 + tolerance)
    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")

    # The two message parts are separated explicitly; they used to run together.
    assert lower_bound <= actual_value <= upper_bound, (
        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内. "
        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
    )
def test_thinking_logic_flag(openai_client, capsys):
    """
    Exercise the enable_thinking switch under three request configurations.

    Cases, in order:
    1. Thinking enabled with neither max_tokens nor reasoning_max_tokens
       supplied: reasoning_content must be present.
    2. Thinking enabled with max_tokens=20 and reasoning_max_tokens=5:
       reasoning_content must be present.
    3. Thinking disabled with max_tokens=20: reasoning_content must be None.
    """

    def reasoning_for(prompt, extra_body, **limits):
        """Send one non-streaming request and return its reasoning_content."""
        reply = openai_client.chat.completions.create(
            model="default",
            messages=[{"role": "user", "content": prompt}],
            temperature=1,
            stream=False,
            extra_body=extra_body,
            **limits,
        )
        return reply.choices[0].message.reasoning_content

    long_prompt = "Explain gravity in a way that a five-year-old child can understand."

    # Case 1: no token limits supplied at all.
    reasoning_case_1 = reasoning_for(
        "Explain gravity briefly.",
        {"chat_template_kwargs": {"enable_thinking": True}},
    )
    assert reasoning_case_1 is not None

    # Case 2: explicit max_tokens and reasoning_max_tokens.
    reasoning_case_2 = reasoning_for(
        long_prompt,
        {
            "chat_template_kwargs": {"enable_thinking": True},
            "reasoning_max_tokens": 5,
        },
        max_tokens=20,
    )
    assert reasoning_case_2 is not None

    # Case 3: thinking switched off.
    reasoning_case_3 = reasoning_for(
        long_prompt,
        {"chat_template_kwargs": {"enable_thinking": False}},
        max_tokens=20,
    )
    assert reasoning_case_3 is None
@@ -1,647 +0,0 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import concurrent.futures
import json
import os
import re
import signal
import subprocess
import sys
import time
import openai
import pytest
import requests
from jsonschema import validate
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)
from e2e.utils.serving_utils import (
FD_API_PORT,
FD_CACHE_QUEUE_PORT,
FD_ENGINE_QUEUE_PORT,
FD_METRICS_PORT,
clean_ports,
is_port_open,
)
@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
    Pytest fixture that runs once per test session:
    - Cleans ports before tests
    - Starts the API server as a subprocess, with output written to server.log
    - Waits for the server port to open (up to 300 seconds, polled every second)
    - Sends SIGTERM to the server's process group after all tests finish and
      waits for the server process to exit

    Raises:
        RuntimeError: if the API port does not open within the wait window.
    """
    print("Pre-test port cleanup...")
    clean_ports()

    base_path = os.getenv("MODEL_PATH")
    if base_path:
        model_path = os.path.join(base_path, "Qwen2-7B-Instruct")
    else:
        model_path = "./Qwen2-7B-Instruct"

    log_path = "server.log"
    cmd = [
        sys.executable,
        "-m",
        "fastdeploy.entrypoints.openai.api_server",
        "--model",
        model_path,
        "--port",
        str(FD_API_PORT),
        "--tensor-parallel-size",
        "1",
        "--engine-worker-queue-port",
        str(FD_ENGINE_QUEUE_PORT),
        "--metrics-port",
        str(FD_METRICS_PORT),
        "--cache-queue-port",
        str(FD_CACHE_QUEUE_PORT),
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "128",
        "--quantization",
        "wint8",
    ]

    # Start subprocess in new process group
    with open(log_path, "w") as logfile:
        process = subprocess.Popen(
            cmd,
            stdout=logfile,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # Enables killing full group via os.killpg
        )

    # Wait up to 300 seconds for API server to be ready
    for _ in range(300):
        if is_port_open("127.0.0.1", FD_API_PORT):
            print(f"API server is up on port {FD_API_PORT}")
            break
        time.sleep(1)
    else:
        print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...")
        try:
            os.killpg(process.pid, signal.SIGTERM)
        except Exception as e:
            print(f"Failed to kill process group: {e}")
        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")

    yield  # Run tests

    print("\n===== Post-test server cleanup... =====")
    try:
        os.killpg(process.pid, signal.SIGTERM)
        print(f"API server (pid={process.pid}) terminated")
    except Exception as e:
        print(f"Failed to terminate API server: {e}")
    else:
        # Reap the child so it does not linger as a zombie; bounded so that
        # teardown cannot hang if the server ignores SIGTERM.
        try:
            process.wait(timeout=30)
        except subprocess.TimeoutExpired:
            print(f"API server (pid={process.pid}) still running 30s after SIGTERM")
@pytest.fixture(scope="session")
def api_url(request):
    """
    Build the chat completions endpoint URL from FD_API_PORT.
    """
    endpoint = "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT)
    return endpoint
@pytest.fixture(scope="session")
def metrics_url(request):
    """
    Build the metrics endpoint URL from FD_METRICS_PORT.
    """
    endpoint = "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT)
    return endpoint
@pytest.fixture
def headers():
    """
    Provide the HTTP headers shared by the JSON requests in this module.
    """
    common_headers = {"Content-Type": "application/json"}
    return common_headers
@pytest.fixture
def consistent_payload():
    """
    Provide a fixed request payload for the consistency test.

    The prompt, temperature, top_p and seed are all pinned.
    """
    payload = {"messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}]}
    payload["temperature"] = 0.9
    payload["top_p"] = 0  # fix top_p to reduce randomness
    payload["seed"] = 13  # fixed random seed
    return payload
# ==========================
# JSON Schema for validating chat API responses
# ==========================
# Passed as `schema=` to jsonschema's validate() by the chat tests.
# The top level requires id/object/created/model/choices.
chat_response_schema = {
"type": "object",
"properties": {
"id": {"type": "string"},
"object": {"type": "string"},
"created": {"type": "number"},
"model": {"type": "string"},
# Each choice must carry a message, its index and a finish_reason.
"choices": {
"type": "array",
"items": {
"type": "object",
"properties": {
# A message must have both role and content, each a string.
"message": {
"type": "object",
"properties": {
"role": {"type": "string"},
"content": {"type": "string"},
},
"required": ["role", "content"],
},
"index": {"type": "number"},
"finish_reason": {"type": "string"},
},
"required": ["message", "index", "finish_reason"],
},
},
},
"required": ["id", "object", "created", "model", "choices"],
}
# ==========================
# Helper function to calculate difference rate between two texts
# ==========================
def calculate_diff_rate(text1, text2):
    """
    Calculate the difference rate between two strings
    based on the normalized Levenshtein edit distance.

    Args:
        text1: First sequence to compare.
        text2: Second sequence to compare.

    Returns:
        A float in [0, 1]: the edit distance divided by the length of the
        longer input, where 0 means identical.
    """
    if text1 == text2:
        return 0.0

    len1, len2 = len(text1), len(text2)
    max_len = max(len1, len2)
    if max_len == 0:
        return 0.0

    # Only the previous row of the DP table is ever read, so keep two rows
    # instead of the full (len1 + 1) x (len2 + 1) matrix.
    previous_row = list(range(len2 + 1))
    for i, char1 in enumerate(text1, start=1):
        current_row = [i]
        for j, char2 in enumerate(text2, start=1):
            if char1 == char2:
                current_row.append(previous_row[j - 1])
            else:
                current_row.append(1 + min(previous_row[j], current_row[j - 1], previous_row[j - 1]))
        previous_row = current_row

    edit_distance = previous_row[len2]
    return edit_distance / max_len
# ==========================
# Valid prompt test cases for parameterized testing
# ==========================
# Each entry is a complete `messages` list containing a single user turn.
valid_prompts = [
[{"role": "user", "content": "你好"}],
[{"role": "user", "content": "用一句话介绍 FastDeploy"}],
]
@pytest.mark.parametrize("messages", valid_prompts)
def test_valid_chat(messages, api_url, headers):
    """
    Post each valid prompt and require a 200 reply that matches the response schema.
    """
    payload = {"messages": messages}
    reply = requests.post(api_url, headers=headers, json=payload)
    assert reply.status_code == 200
    validate(instance=reply.json(), schema=chat_response_schema)
# ==========================
# Consistency test for repeated runs with fixed payload
# ==========================
def test_consistency_between_runs(api_url, headers, consistent_payload):
    """
    Send the same fixed payload twice and require near-identical outputs.

    The two replies are compared with calculate_diff_rate and must differ
    by less than 5%.
    """
    contents = []
    # Issue the identical request twice, one after the other.
    for _ in range(2):
        reply = requests.post(api_url, headers=headers, json=consistent_payload)
        assert reply.status_code == 200
        contents.append(reply.json()["choices"][0]["message"]["content"])

    first_text, second_text = contents

    # Verify that the difference rate is below the threshold
    diff_rate = calculate_diff_rate(first_text, second_text)
    assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})"
# ==========================
# Invalid prompt tests
# ==========================
# Each entry is a malformed `messages` value.
invalid_prompts = [
[], # Empty array
[{}], # Empty object
[{"role": "user"}], # Missing content
[{"content": "hello"}], # Missing role
]
@pytest.mark.parametrize("messages", invalid_prompts)
def test_invalid_chat(messages, api_url, headers):
    """
    Post each malformed prompt and require an error status code (>= 400).
    """
    payload = {"messages": messages}
    reply = requests.post(api_url, headers=headers, json=payload)
    assert reply.status_code >= 400, "Invalid request should return an error status code"
# ==========================
# Test for input exceeding context length
# ==========================
def test_exceed_context_length(api_url, headers):
    """
    Send an overly long prompt and require that it is not answered normally.

    The test passes when the status code is not 200, or when the JSON body
    mentions "token" (case-insensitive).
    """
    # Construct an overly long message
    oversized_text = "你好," * 20000
    payload = {"messages": [{"role": "user", "content": oversized_text}]}
    reply = requests.post(api_url, headers=headers, json=payload)

    # A body that cannot be parsed as JSON is treated as an empty object.
    try:
        body = reply.json()
    except Exception:
        body = {}

    rejected = reply.status_code != 200
    assert (
        rejected or "token" in json.dumps(body).lower()
    ), f"Expected token limit error or similar, but got a normal response: {body}"
# ==========================
# Multi-turn Conversation Test
# ==========================
def test_multi_turn_conversation(api_url, headers):
    """
    Post a user/assistant/user history and require a schema-valid 200 reply.
    """
    history = [
        {"role": "user", "content": "你是谁?"},
        {"role": "assistant", "content": "我是AI助手"},
        {"role": "user", "content": "你能做什么?"},
    ]
    reply = requests.post(api_url, headers=headers, json={"messages": history})
    assert reply.status_code == 200
    validate(instance=reply.json(), schema=chat_response_schema)
# ==========================
# Concurrent Performance Test
# ==========================
def test_concurrent_perf(api_url, headers):
    """
    Fire 8 identical requests from 8 worker threads and print each response time.

    Every request must return status 200.
    """
    prompts = [{"role": "user", "content": "Introduce FastDeploy."}]
    request_count = 8

    def send_request():
        """
        Post one request and return its elapsed time in seconds.
        """
        reply = requests.post(api_url, headers=headers, json={"messages": prompts})
        assert reply.status_code == 200
        return reply.elapsed.total_seconds()

    with concurrent.futures.ThreadPoolExecutor(max_workers=request_count) as pool:
        pending = [pool.submit(send_request) for _ in range(request_count)]
        # result() re-raises any assertion failure from a worker thread.
        durations = [task.result() for task in pending]

    print("\nResponse time for each request:", durations)
# ==========================
# Metrics Endpoint Test
# ==========================
def test_metrics_endpoint(metrics_url):
    """
    Test the metrics monitoring endpoint.

    Fetches ``metrics_url``, requires a 200 text/plain reply, and checks that
    every expected ``fastdeploy:*`` metric appears with a non-negative value.

    Args:
        metrics_url: URL of the metrics endpoint (pytest fixture).
    """
    # No name in this tuple is a prefix of another, so matching by prefix and
    # stopping at the first hit is unambiguous.
    expected_metrics = (
        "num_requests_running",
        "num_requests_waiting",
        "time_to_first_token_seconds_sum",
        "time_per_output_token_seconds_sum",
        "e2e_request_latency_seconds_sum",
        "request_inference_time_seconds_sum",
        "request_queue_time_seconds_sum",
        "request_prefill_time_seconds_sum",
        "request_decode_time_seconds_sum",
        "prompt_tokens_total",
        "generation_tokens_total",
        "request_prompt_tokens_sum",
        "request_generation_tokens_sum",
        "gpu_cache_usage_perc",
        "request_params_max_tokens_sum",
        "request_success_total",
        "cache_config_info",
        "available_batch_size",
        "hit_req_rate",
        "hit_token_rate",
        "cpu_hit_token_rate",
        "gpu_hit_token_rate",
    )

    resp = requests.get(metrics_url, timeout=5)
    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
    assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain"

    # Keep only sample lines: skip lines starting with "#" and blank lines.
    metric_lines = [line for line in resp.text.split("\n") if not line.startswith("#") and line.strip() != ""]

    found = set()
    for line in metric_lines:
        for name in expected_metrics:
            if line.startswith(f"fastdeploy:{name}"):
                # The sample value is the last space-separated field.
                _, value = line.rsplit(" ", 1)
                assert float(value) >= 0, f"{name} 值错误"
                found.add(name)
                break

    # Report the first missing metric under its own name.
    for name in expected_metrics:
        assert name in found, f"缺少 fastdeploy:{name} 指标"
# ==========================
# OpenAI Client chat.completions Test
# ==========================
@pytest.fixture
def openai_client():
    """
    Build an OpenAI client pointed at the /v1 endpoint on FD_API_PORT.
    """
    host = "0.0.0.0"
    port = str(FD_API_PORT)
    return openai.Client(
        base_url=f"http://{host}:{port}/v1",
        api_key="EMPTY_API_KEY",
    )
# Non-streaming test
def test_non_streaming_chat(openai_client):
    """Send a non-streaming chat request and check the reply carries a message with content."""
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "List 3 countries and their capitals."},
    ]
    reply = openai_client.chat.completions.create(
        model="default",
        messages=conversation,
        temperature=1,
        max_tokens=1024,
        stream=False,
    )

    assert hasattr(reply, "choices")
    assert len(reply.choices) > 0
    first_choice = reply.choices[0]
    assert hasattr(first_choice, "message")
    assert hasattr(first_choice.message, "content")
# Streaming test
def test_streaming_chat(openai_client, capsys):
    """Send a streaming multi-turn chat request and require more than two content chunks."""
    conversation = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "List 3 countries and their capitals."},
        {
            "role": "assistant",
            "content": "China(Beijing), France(Paris), Australia(Canberra).",
        },
        {"role": "user", "content": "OK, tell more."},
    ]
    stream = openai_client.chat.completions.create(
        model="default",
        messages=conversation,
        temperature=1,
        max_tokens=1024,
        stream=True,
    )

    # Collect the content of every chunk whose first choice exposes delta.content.
    pieces = [
        chunk.choices[0].delta.content
        for chunk in stream
        if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content")
    ]
    assert len(pieces) > 2
# ==========================
# OpenAI Client completions Test
# ==========================
def test_non_streaming(openai_client):
    """Send a non-streaming completions request and check at least one choice is returned."""
    request_args = {
        "model": "default",
        "prompt": "Hello, how are you?",
        "temperature": 1,
        "max_tokens": 1024,
        "stream": False,
    }
    reply = openai_client.completions.create(**request_args)

    # Check the response structure
    assert hasattr(reply, "choices")
    assert len(reply.choices) > 0
def test_streaming(openai_client, capsys):
    """Send a streaming completions request and check at least one chunk is received."""
    stream = openai_client.completions.create(
        model="default",
        prompt="Hello, how are you?",
        temperature=1,
        max_tokens=1024,
        stream=True,
    )

    # Collect the text of the first choice from every streamed chunk.
    pieces = [chunk.choices[0].text for chunk in stream]
    assert len(pieces) > 0
def test_profile_reset_block_num():
    """
    Check that the profiled total_block_num stays within 5% of the baseline.

    Reads ./log/config.log, takes the first line containing
    "Reset block num", extracts the integer following "total_block_num:" and
    compares it with the hard-coded baseline. The test fails if the log file,
    the line, or the number is missing.
    """
    log_file = "./log/config.log"
    baseline = 32562
    tolerance = 0.05

    if not os.path.exists(log_file):
        pytest.fail(f"Log file not found: {log_file}")

    # Stream the log and stop at the first match instead of loading it whole.
    with open(log_file, "r") as f:
        target_line = next((line.strip() for line in f if "Reset block num" in line), None)

    if target_line is None:
        pytest.fail("日志中没有Reset block num信息")

    match = re.search(r"total_block_num:(\d+)", target_line)
    if not match:
        pytest.fail(f"Failed to extract total_block_num from line: {target_line}")

    # The (\d+) group only matches digits, so int() cannot raise here.
    actual_value = int(match.group(1))

    lower_bound = baseline * (1 - tolerance)
    upper_bound = baseline * (1 + tolerance)
    print(f"Reset total_block_num: {actual_value}. baseline: {baseline}")

    # The two message parts are separated explicitly; they used to run together.
    assert lower_bound <= actual_value <= upper_bound, (
        f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内. "
        f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
    )
+6 -2
View File
@@ -1388,7 +1388,11 @@ class TestFastDeployBatch(unittest.TestCase):
clean_ports()
# 3. 确定模型路径
self.model_path = "baidu/ERNIE-4.5-0.3B-PT"
base_path = os.getenv("MODEL_PATH")
if base_path:
self.model_path = os.path.join(base_path, "ERNIE-4.5-0.3B-Paddle")
else:
self.model_path = "./ERNIE-4.5-0.3B-Paddle"
self.run_batch_command = [sys.executable, "fastdeploy/entrypoints/openai/run_batch.py"]
@@ -1520,7 +1524,7 @@ class TestFastDeployBatch(unittest.TestCase):
def test_completions(self):
"""测试正常的批量chat请求"""
return_code, contents, proc = self.run_fastdeploy_command(INPUT_BATCH, port="2235")
return_code, contents, proc = self.run_fastdeploy_command(INPUT_BATCH, port=str(FD_CACHE_QUEUE_PORT))
print(f"进程输出: {return_code}")
self.assertEqual(return_code, 0, f"进程返回非零码: {return_code}, 进程信息: {proc}")