[XPU] [CI] Fix xpu ci bug (#7014)

* fix xpu ci bug

* Remove unnecessary blank line in conftest.py

* Update upload-artifact action to version 6

* Update _xpu_8cards_case_test.yml

* fix ci bug

* Change exit code on test failure to 1

* fix ci bug

* fix ci bug

* fix ci bug

* fix ci bug

* Update conftest.py
This commit is contained in:
Jiaxin Sui
2026-03-27 10:29:34 +08:00
committed by GitHub
parent a31d4bfbdf
commit c3ed7db28d
7 changed files with 77 additions and 4 deletions
@@ -193,13 +193,29 @@ jobs:
echo "============================开始运行pytest测试============================"
export PYTHONPATH=/workspace/FastDeploy/
export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH
mkdir -p case_logs
set +e
python -m pytest -v -s --tb=short tests/xpu_ci/4cards_cases/
exit_code=$?
set -e
# 修改case_logs权限,确保Docker外部的runner用户可以读取并上传
chmod -R a+rX case_logs/ 2>/dev/null || true
if [ $exit_code -eq 0 ]; then
echo "============================4卡cases测试通过!============================"
exit $exit_code
else
echo "============================4卡cases测试失败,请检查日志!============================"
exit $exit_code
fi
'
- name: Upload case logs
if: always()
uses: actions/upload-artifact@v6
with:
name: xpu-4cards-case-logs
path: FastDeploy/case_logs/
retention-days: 7
if-no-files-found: ignore
@@ -182,8 +182,14 @@ jobs:
echo "============================开始运行pytest测试============================"
export PYTHONPATH=/workspace/FastDeploy/
export PYTHONPATH=$(pwd)/tests/xpu_ci:$PYTHONPATH
mkdir -p case_logs
set +e
python -m pytest -v -s --tb=short tests/xpu_ci/8cards_cases/
exit_code=$?
set -e
# 修改case_logs权限,确保Docker外部的runner用户可以读取并上传
chmod -R a+rX case_logs/ 2>/dev/null || true
if [ $exit_code -eq 0 ]; then
echo "============================8卡cases测试通过!============================"
@@ -192,3 +198,12 @@ jobs:
exit $exit_code
fi
'
- name: Upload case logs
if: always()
uses: actions/upload-artifact@v6
with:
name: xpu-8cards-case-logs
path: FastDeploy/case_logs/
retention-days: 7
if-no-files-found: ignore
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
log_dirs = ["log_router", "log_prefill", "log_decode"]
for log_dir in log_dirs:
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
if os.path.exists(nohup_path):
print(f"\n========== {nohup_path} ==========")
with open(nohup_path, "r") as f:
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
log_dirs = ["log_router", "log_prefill", "log_decode"]
for log_dir in log_dirs:
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
if os.path.exists(nohup_path):
print(f"\n========== {nohup_path} ==========")
with open(nohup_path, "r") as f:
@@ -109,7 +109,7 @@ def print_pd_logs_on_failure():
log_dirs = ["log_router", "log_prefill", "log_decode"]
for log_dir in log_dirs:
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
if os.path.exists(nohup_path):
print(f"\n========== {nohup_path} ==========")
with open(nohup_path, "r") as f:
@@ -110,7 +110,7 @@ def print_pd_logs_on_failure():
log_dirs = ["log_router", "log_prefill", "log_decode"]
for log_dir in log_dirs:
nohup_path = os.path.join(log_dir, "log_0/worklog.0")
nohup_path = os.path.join(log_dir, "log_0/workerlog.0")
if os.path.exists(nohup_path):
print(f"\n========== {nohup_path} ==========")
with open(nohup_path, "r") as f:
+42
View File
@@ -23,6 +23,7 @@ XPU CI测试框架 - 通用配置和辅助函数
4. 环境配置 - 设置XPU相关环境变量
"""
import glob
import json
import os
import shutil
@@ -31,6 +32,8 @@ import time
import pytest
CASE_LOGS_DIR = os.path.join(os.getcwd(), "case_logs")
def get_xpu_id():
"""获取XPU_ID环境变量"""
@@ -457,3 +460,42 @@ def setup_logprobs_zmq_env():
os.environ[key] = value
print(f"设置环境变量: {key}={value}")
return original_values
# ============ 日志归档 pytest hook ============
def _archive_case_logs(test_name):
    """Copy the logs found in the current working directory into the archive.

    Every entry in the current working directory whose name starts with
    ``log`` is copied to ``CASE_LOGS_DIR/<test_name>/`` (directories
    recursively, files individually), followed by ``server.log`` when it
    exists. Existing archived copies are overwritten so the archive always
    reflects the most recent call for ``test_name``.

    Args:
        test_name: Name of the sub-directory of ``CASE_LOGS_DIR`` that
            receives the copies.
    """
    dest_dir = os.path.join(CASE_LOGS_DIR, test_name)
    os.makedirs(dest_dir, exist_ok=True)

    # Copy every "log*" entry: directories are merged into any existing
    # archive, plain files are overwritten.
    for entry in glob.glob("log*"):
        target = os.path.join(dest_dir, entry)
        if os.path.isdir(entry):
            shutil.copytree(entry, target, dirs_exist_ok=True)
        elif os.path.isfile(entry):
            shutil.copy2(entry, target)

    # server.log does not match "log*", so it is handled separately. It is
    # always re-copied: skipping it when an archived copy already exists
    # would keep a stale file from an earlier call with the same test_name.
    if os.path.isfile("server.log"):
        shutil.copy2("server.log", os.path.join(dest_dir, "server.log"))
@pytest.hookimpl(hookwrapper=True, trylast=True)
def pytest_runtest_makereport(item, call):
    """Archive the test's logs after the report for its ``call`` phase is built.

    The hook wraps report creation and only acts when ``report.when`` is
    ``"call"``. The archive directory name is the test file name without
    its extension, so all tests of one file share one directory.

    Archiving is best-effort: a failure is printed but never raised, so it
    cannot change the outcome of the test run.

    NOTE(review): reports for the ``setup`` and ``teardown`` phases are
    ignored here, so a test that fails during setup gets no archive --
    confirm this is intended.
    """
    outcome = yield
    report = outcome.get_result()
    if report.when != "call":
        return

    # Use the test file name (without ".py") as the archive directory name.
    test_file = os.path.basename(item.fspath)
    test_name = os.path.splitext(test_file)[0]
    try:
        _archive_case_logs(test_name)
    except Exception as exc:
        # Keep archiving best-effort, but surface the problem instead of
        # silently swallowing it so missing artifacts can be diagnosed.
        print(f"[case_logs] failed to archive logs for {test_name}: {exc!r}")