From 1e08ee74e577f4988d0c06ece4310bbff7e2050e Mon Sep 17 00:00:00 2001 From: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Date: Mon, 13 Apr 2026 20:23:49 +0800 Subject: [PATCH] [CI] Modify 4-card container startup config and move test case (#7363) --- .github/workflows/_gpu_4cards_case_test.yml | 12 +++++++++++- .../test_ernie_03b_pd_router_v1_rdma_tp2.py | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) rename tests/e2e/{ => 4cards_cases}/test_ernie_03b_pd_router_v1_rdma_tp2.py (98%) diff --git a/.github/workflows/_gpu_4cards_case_test.yml b/.github/workflows/_gpu_4cards_case_test.yml index 5c9a51aa80..797fddb775 100644 --- a/.github/workflows/_gpu_4cards_case_test.yml +++ b/.github/workflows/_gpu_4cards_case_test.yml @@ -181,11 +181,17 @@ jobs: docker rm -f ${runner_name} || true fi + export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}') + docker run --rm --net=host \ - --shm-size=64g \ --sysctl kernel.msgmax=1048576 \ --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ + --cap-add=SYS_PTRACE --cap-add=IPC_LOCK \ + --shm-size=64G \ + ${RDMA_DEVICES} \ + --device=/dev/infiniband/rdma_cm \ + --ulimit memlock=-1:-1 \ -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ @@ -197,6 +203,10 @@ jobs: -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ -e "FLASK_PORT=${FLASK_PORT}" \ -e "FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT}" \ + -e "FD_ROUTER_PORT=${FD_ROUTER_PORT}" \ + -e "FD_CONNECTOR_PORT=${FD_CONNECTOR_PORT}" \ + -e "FD_RDMA_PORT=${FD_RDMA_PORT}" \ + -e "CLEAN_CUDA=1" \ -e TZ="Asia/Shanghai" \ -e "fd_wheel_url=${fd_wheel_url}" \ -e "BASE_REF=${BASE_REF}" \ diff --git a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py b/tests/e2e/4cards_cases/test_ernie_03b_pd_router_v1_rdma_tp2.py similarity index 98% rename from tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py rename to tests/e2e/4cards_cases/test_ernie_03b_pd_router_v1_rdma_tp2.py index 0bbc8186a5..85858da1dc 100644 --- a/tests/e2e/test_ernie_03b_pd_router_v1_rdma_tp2.py +++ b/tests/e2e/4cards_cases/test_ernie_03b_pd_router_v1_rdma_tp2.py @@ -26,7 +26,7 @@ import time import pytest import requests -from utils.serving_utils import ( +from e2e.utils.serving_utils import ( FD_API_PORT, FD_CACHE_QUEUE_PORT, FD_ENGINE_QUEUE_PORT, @@ -90,7 +90,7 @@ def setup_and_run_server(): # get rdma nics current_dir = os.path.dirname(os.path.abspath(__file__)) - shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh") + shell_path = os.path.join(current_dir, "../utils/get_rdma_nics.sh") output = subprocess.check_output(["bash", shell_path, "gpu"], text=True) _, rdma_nics = output.split("=") print(f"shell_path: {shell_path}, rdma_nics: {rdma_nics}") @@ -171,7 +171,7 @@ def setup_and_run_server(): # decode实例 print("start decode...") env_decode = os.environ.copy() - env_decode["CUDA_VISIBLE_DEVICES"] = "1" + env_decode["CUDA_VISIBLE_DEVICES"] = "2" env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode") # env_decode["KVCACHE_RDMA_NICS"] = rdma_nics