Files
FastDeploy/.github/workflows/_xpu_4cards_case_test.yml
T
Jiaxin Sui c3ed7db28d [XPU] [CI] Fix xpu ci bug (#7014)
* fix xpu ci bug

* Remove unnecessary blank line in conftest.py

* Update upload-artifact action to version 6

* Update _xpu_8cards_case_test.yml

* fix ci bug

* Change exit code on test failure to 1

* fix ci bug

* fix ci bug

* fix ci bug

* fix ci bug

* Update conftest.py
2026-03-27 10:29:34 +08:00

222 lines
8.2 KiB
YAML

# Reusable workflow (triggered through `workflow_call`) that declares the
# inputs and the secret consumed by the XPU 4-card case-test job.
name: xpu_4cards_case_test

on:
  workflow_call:
    inputs:
      # Docker image reference; the job pulls it and runs its containers from it.
      DOCKER_IMAGE:
        description: 'Build Images'
        required: true
        type: string
        default: 'ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:ci'

      # Location of the source archive downloaded by the job.
      FASTDEPLOY_ARCHIVE_URL:
        description: 'URL of the compressed FastDeploy code archive.'
        required: true
        type: string

      # Location of the FastDeploy wheel; the test job is skipped when empty.
      FASTDEPLOY_WHEEL_URL:
        description: 'URL of the compressed FastDeploy whl .'
        required: true
        type: string

      # Optional; defaults to an empty string.
      FD_VERSION:
        description: 'FastDeploy Package Version'
        required: false
        type: string
        default: ''

      # Optional; defaults to an empty string.
      PADDLEVERSION:
        description: 'Paddle Version Build Use'
        required: false
        type: string
        default: ''

      # Optional; defaults to an empty string.
      PADDLE_WHL_URL:
        description: 'Paddle Wheel Package URL'
        required: false
        type: string
        default: ''

      # Exposed to the test container as MODEL_PATH.
      MODEL_PATH:
        description: 'MODEL Dir Use'
        required: true
        type: string
        default: ''

    secrets:
      # Handed on to the check-bypass workflow.
      github-token:
        required: true
jobs:
  # Calls the shared bypass workflow; the test job reads its `can-skip` output.
  check_bypass:
    uses: ./.github/workflows/check-bypass.yml
    secrets:
      github-token: ${{ secrets.github-token }}
    with:
      workflow-name: xpu_4cards_test

  # Downloads the code archive, then runs tests/xpu_ci/4cards_cases with pytest
  # inside a privileged container on a self-hosted 4-card XPU runner.
  run_xpu_4cards_cases:
    runs-on: [self-hosted, XPU-P800-4Cards]
    needs: check_bypass
    # Run only when a wheel URL was supplied and the bypass check did not
    # report that the job can be skipped.
    if: ${{ inputs.FASTDEPLOY_WHEEL_URL != '' && needs.check_bypass.outputs.can-skip != 'true' }}
    timeout-minutes: 60
    steps:
      - name: Print current runner name
        run: |
          echo "Current runner name: ${{ runner.name }}"

      - name: Code Prepare
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
          fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          model_path: ${{ inputs.MODEL_PATH }}
        run: |
          set -x
          FULL_REPO="${{ github.repository }}"
          # Keep only the part after the last slash (the repository name).
          REPO_NAME="${FULL_REPO##*/}"
          # Best effort: a failed pull does not abort the step.
          docker pull "${docker_image}" || true
          # Clean the repository directory before starting. The removal runs
          # inside the container, with up to three attempts.
          docker run --rm --net=host -v "$(pwd):/workspace" -w /workspace \
            -e "REPO_NAME=${REPO_NAME}" \
            "${docker_image}" /bin/bash -c '
            CLEAN_RETRIES=3
            CLEAN_COUNT=0
            while [ $CLEAN_COUNT -lt $CLEAN_RETRIES ]; do
              echo "Attempt $((CLEAN_COUNT+1)) to remove ${REPO_NAME}* ..."
              rm -rf "${REPO_NAME}"* || true
              sleep 2
              # Check if anything matching ${REPO_NAME}* still exists
              if ! ls "${REPO_NAME}"* >/dev/null 2>&1; then
                echo "All ${REPO_NAME}* removed successfully"
                break
              fi
              CLEAN_COUNT=$((CLEAN_COUNT + 1))
            done
            if ls "${REPO_NAME}"* >/dev/null 2>&1; then
              echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts"
              ls -ld "${REPO_NAME}"*
              exit 1
            fi
          '
          wget -q --no-proxy "${fd_archive_url}"
          tar -xf FastDeploy.tar.gz
          rm -rf FastDeploy.tar.gz
          cd FastDeploy
          git config --global user.name "FastDeployCI"
          git config --global user.email "fastdeploy_ci@example.com"
          git log -n 3 --oneline

      - name: Run CI unittest
        # The script relies on bash-only syntax (`${var: -1}`, `[[ ]]`).
        shell: bash
        env:
          docker_image: ${{ inputs.DOCKER_IMAGE }}
          fd_archive_url: ${{ inputs.FASTDEPLOY_ARCHIVE_URL }}
          fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }}
          model_path: ${{ inputs.MODEL_PATH }}
          # Must be exported here: `docker run` below forwards both with `-e`,
          # and the container script uses them to choose the PaddlePaddle
          # package to install.
          PADDLEVERSION: ${{ inputs.PADDLEVERSION }}
          PADDLE_WHL_URL: ${{ inputs.PADDLE_WHL_URL }}
        run: |
          # A runner whose name ends in "1" gets XPU_ID=4; any other gets 0.
          runner_name="${{ runner.name }}"
          last_char="${runner_name: -1}"
          if [[ "$last_char" == "1" ]]; then
            xpu_id="4"
          else
            xpu_id="0"
          fi
          # Logged for debugging only; GITHUB_WORKSPACE is set by the runner.
          PARENT_DIR=$(dirname "$GITHUB_WORKSPACE")
          echo "PARENT_DIR:$PARENT_DIR"
          docker run --rm --net=host --cap-add=SYS_PTRACE --privileged --shm-size=64G \
            -v "$(pwd):/workspace" -w /workspace \
            -v "/ssd3:/ssd3" \
            -e "MODEL_PATH=${model_path}" \
            -e "FASTDEPLOY_ARCHIVE_URL=${fd_archive_url}" \
            -e "FASTDEPLOY_WHEEL_URL=${fd_wheel_url}" \
            -e "PADDLEVERSION=${PADDLEVERSION}" \
            -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \
            -e "http_proxy=$(git config --global --get http.proxy)" \
            -e "https_proxy=$(git config --global --get https.proxy)" \
            -e "no_proxy=bcebos.com,mirrors.tuna.tsinghua.edu.cn,127.0.0.1,localhost" \
            -e "XPU_ID=${xpu_id}" \
            "${docker_image}" /bin/bash -c '
            echo "安装lsof工具..."
            apt install -y lsof
            # Map XPU_ID to the XPU_VISIBLE_DEVICES group
            if [[ "$XPU_ID" == "0" ]]; then
              export XPU_VISIBLE_DEVICES="0,1,2,3"
            else
              export XPU_VISIBLE_DEVICES="4,5,6,7"
            fi
            echo "XPU_VISIBLE_DEVICES=$XPU_VISIBLE_DEVICES"
            # Download and unpack xre unless /workspace/deps/xre already exists
            echo "下载和安装xre..."
            mkdir -p /workspace/deps
            cd /workspace/deps
            if [ ! -d "xre" ]; then
              wget -q https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz
              tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
            fi
            cd -
            export PATH=/workspace/deps/xre/bin:$PATH
            # Reset the selected XPU cards
            echo "重启XPU卡..."
            xpu-smi -r -i "$XPU_VISIBLE_DEVICES"
            xpu-smi
            # From here on, any failing command aborts the container script
            set -e
            git config --global --add safe.directory /workspace/FastDeploy
            cd FastDeploy
            python -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
            python -m pip install -r requirements.txt
            echo "安装PaddlePaddle..."
            # Pick the PaddlePaddle package: explicit wheel URL first, then a
            # pinned stable version, otherwise the latest nightly build
            python -m pip uninstall paddlepaddle-xpu fastdeploy-xpu -y
            if [[ "${PADDLE_WHL_URL}" != "" ]]; then
              python -m pip install "${PADDLE_WHL_URL}"
            elif [[ "${PADDLEVERSION}" != "" ]]; then
              python -m pip install "paddlepaddle-xpu==${PADDLEVERSION}" -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
            else
              python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
            fi
            echo "安装上游任务编译的fastdeploy-xpu..."
            python -m pip install "${FASTDEPLOY_WHEEL_URL}"
            # Replace the source-tree package directory with the wheel contents
            rm -rf fastdeploy
            python -m pip install "${FASTDEPLOY_WHEEL_URL}" --no-deps --target=/workspace/FastDeploy
            echo "============================安装测试依赖============================"
            python -m pip install openai -U
            python -m pip install pytest
            python -m pip install pytest-timeout
            unset http_proxy
            unset https_proxy
            echo "============================开始运行pytest测试============================"
            export PYTHONPATH=/workspace/FastDeploy/
            export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH
            mkdir -p case_logs
            # Capture the pytest status instead of aborting, so the log
            # permissions can still be fixed before exiting
            set +e
            python -m pytest -v -s --tb=short tests/xpu_ci/4cards_cases/
            exit_code=$?
            set -e
            # Relax permissions on case_logs so the runner user outside Docker can read and upload it
            chmod -R a+rX case_logs/ 2>/dev/null || true
            if [ $exit_code -eq 0 ]; then
              echo "============================4卡cases测试通过!============================"
            else
              echo "============================4卡cases测试失败,请检查日志!============================"
            fi
            exit $exit_code
          '

      - name: Upload case logs
        # Upload logs even when the test step failed.
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: xpu-4cards-case-logs
          path: FastDeploy/case_logs/
          retention-days: 7
          if-no-files-found: ignore