[Cherry-Pick][CI] Sync parallelism optimization from dev to 2.5 (#7085) (#7140)

* [Cherry-Pick][CI] Sync parallelism optimization from dev to 2.5 (#7085)
2026-04-23 00:17:25 +08:00 · 2026-04-02 14:21:47 +08:00
parent 566699303c
commit 7648164f6e
17 changed files with 375 additions and 90 deletions
@@ -182,7 +182,10 @@ jobs:
            docker rm -f ${runner_name} || true
          fi

-          docker run --rm --ipc=host --pid=host --net=host \
+          docker run --rm --net=host \
+          --shm-size=64g \
+          --sysctl kernel.msgmax=1048576 \
+          --sysctl kernel.msgmnb=268435456 \
          --name ${runner_name} \
          -v $(pwd):/workspace \
          -w /workspace \
@@ -166,7 +166,10 @@ jobs:
            docker rm -f ${runner_name} || true
          fi

-          docker run --rm --ipc=host --net=host \
+          docker run --rm --net=host \
+          --shm-size=64g \
+          --sysctl kernel.msgmax=1048576 \
+          --sysctl kernel.msgmnb=268435456 \
          --name ${runner_name} \
          -v $(pwd):/workspace -w /workspace \
          -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
@@ -152,7 +152,11 @@ jobs:
            echo "Removing stale container: ${runner_name}"
            docker rm -f ${runner_name} || true
          fi
-          docker run --rm --ipc=host --pid=host --net=host \
+
+          docker run --rm --net=host \
+          --shm-size=64g \
+          --sysctl kernel.msgmax=1048576 \
+          --sysctl kernel.msgmnb=268435456 \
          --name ${runner_name} \
          -v $(pwd):/workspace \
          -w /workspace \
@@ -163,6 +163,7 @@ jobs:
          fi

          docker run --rm --net=host \
+          --shm-size=64G \
          --name ${runner_name} \
          -v $(pwd):/workspace \
          -w /workspace \
@@ -160,6 +160,7 @@ jobs:
          fi

          docker run --rm --net=host \
+          --shm-size=64G \
          --name ${runner_name} \
          -v $(pwd):/workspace \
          -w /workspace \
@@ -47,6 +47,7 @@ jobs:
    outputs:
      all_cov_file_url: ${{ steps.cov_upload.outputs.all_cov_file_url }}
      unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
+      unittest_logs_url: ${{ steps.cov_upload.outputs.unittest_logs_url }}
      diff_cov_result_json_url: ${{ steps.cov_upload.outputs.diff_cov_result_json_url }}
    steps:
      - name: Code Prepare
@@ -173,6 +174,8 @@ jobs:
          export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}')

          docker run --rm --net=host \
+          --sysctl kernel.msgmax=1048576 \
+          --sysctl kernel.msgmnb=268435456 \
          --name ${runner_name} \
          --cap-add=SYS_PTRACE --cap-add=IPC_LOCK \
          --shm-size=64G \
@@ -309,6 +312,15 @@ jobs:
            echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV
          fi

+          # Upload the packaged unit-test logs only when tests failed AND the archive
+          # exists: the test runner skips packaging when it finds no unittest_logs
+          # directory, and publishing a URL for a file that was never uploaded
+          # would only mislead whoever follows the link.
+          unittest_logs_archive="unittest_logs.tar.gz"
+          if [ "$HAS_FAILED_TESTS" = true ] && [ -f "${unittest_logs_archive}" ]; then
+            python ${push_file} ${unittest_logs_archive} ${target_path}/UnitTestResult
+            UNIT_TEST_LOGS_URL=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/UnitTestResult/${unittest_logs_archive}
+            echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" >> $GITHUB_OUTPUT
+            echo "unittest_logs_url=${UNIT_TEST_LOGS_URL}" >> $GITHUB_ENV
+          elif [ "$HAS_FAILED_TESTS" = true ]; then
+            echo "WARNING: ${unittest_logs_archive} not found, skipping unit test log upload"
+          fi
+
          if [[ "$IS_PR" != "true" ]]; then
            full_cov_file="full_coverage_report.txt"
            full_cov_csv="full_coverage_report.csv"
@@ -345,6 +357,7 @@ jobs:
            if [ -f "${filename}" ];then
              echo "Failed test cases:"
              cat "${filename}"
+              echo "unittest_logs_url=${unittest_logs_url}"
            fi
            exit "$TEST_EXIT_CODE"
          fi
@@ -7,73 +7,95 @@ run_path=$( realpath "$DIR/../")
 export COVERAGE_FILE=${COVERAGE_FILE:-$DIR/../coveragedata/.coverage}
 export COVERAGE_RCFILE=${COVERAGE_RCFILE:-$DIR/../scripts/.coveragerc}

+# ============================================================
+#  classify_tests <test_file>: print the scheduling category of a test file to stdout
+#  - multi_gpu: path is under tests/distributed, tests/e2e or tests/model_loader, or the file content mentions tensor-parallel / multi-GPU / port settings (run sequentially)
+#  - single_gpu: everything else (can run in parallel)
+# ============================================================
+classify_tests() {
+    local test_file=$1
+    # Rule 1: distributed tests (explicit multi-GPU launch)
+    if [[ "$test_file" =~ tests/distributed/.*test_.*\.py ]]; then
+        echo "multi_gpu"
+        return
+    fi

-failed_tests_file="failed_tests.log"
-> "$failed_tests_file"
+    # Rule 2: e2e tests (usually involve service / ports)
+    if [[ "$test_file" =~ tests/e2e/.*test_.*\.py ]]; then
+        echo "multi_gpu"
+        return
+    fi

+    # Rule 3: model loader tests (allocate multiple GPUs)
+    if [[ "$test_file" =~ tests/model_loader/.*test_.*\.py ]]; then
+        echo "multi_gpu"
+        return
+    fi

-##################################
-# Run pytest, one file at a time
-# Use pytest's --collect-only output to extract the actual test file paths (e.g., tests/.../test_*.py).
-# Note: pytest may output lines like "ERROR tests/xxx/test_xxx.py::test_xxx ..." on collection failure,
-# to avoid treating prefixes like "ERROR"/"FAILED"/"collecting" as filenames,
-# we only keep the "tests/.../test_*.py" portion and discard everything else.
-TEST_FILES=$(
-  python -m pytest --collect-only -q -c "${PYTEST_INI}" "${tests_path}" --rootdir="${run_path}" --disable-warnings 2>&1 \
-    | grep -E 'tests/.+\/test_.*\.py' \
-    | sed -E 's@.*(tests/[^: ]*test_[^: ]*\.py).*@\1@' \
-    | sort -u
-)
+    # Rule 4: check file content for a tensor-parallel setting followed on the same line by a digit 1-4
+    #    (the pattern uses [1234], so a value of 1 also matches - NOTE(review): confirm whether [234] was intended),
+    #    or CUDA_VISIBLE_DEVICES / paddle.distributed.launch --gpus followed by "0" then "1", or any of the listed FD_*_PORT names or FLASK_PORT
+    if [ -f "$test_file" ]; then
+        if grep -q '"tensor_parallel_size".*[1234]\|--tensor-parallel-size.*[1234]\|tensor_parallel_size.*=[1234]\|CUDA_VISIBLE_DEVICES.*0.*1\|paddle\.distributed\.launch.*--gpus.*0.*1\|FD_API_PORT\|FLASK_PORT\|FD_ENGINE_QUEUE_PORT\|FD_METRICS_PORT\|FD_CACHE_QUEUE_PORT\|FD_ROUTER_PORT\|FD_CONNECTOR_PORT\|FD_RDMA_PORT' "$test_file" 2>/dev/null; then
+            echo "multi_gpu"
+            return
+        fi
+    fi

+    # ========== Single-GPU tests (no rule above matched, can run in parallel) ==========
+    echo "single_gpu"
+}

-failed_pytest=0
-success_pytest=0
+# ============================================================
+# Run Test With Logging
+# ============================================================
+run_test_with_logging() {
+    local test_file=$1
+    local log_prefix=$2
+    local status

-# nullglob: if no match, the pattern expands to nothing
-shopt -s nullglob
+    echo "Running pytest file: $test_file"

-for file in $TEST_FILES; do
-    echo "Running pytest file: $file"
-    # Clean up previous logs
-    rm -rf "${run_path}"/log* || true
-    for f in "${run_path}"/*.log; do
-        [[ "$(basename "$f")" != "${failed_tests_file}" ]] && rm -f "$f"
-    done
+    # Create isolated log directory for this test to avoid race conditions
+    # Format: unittest_logs/<test_dir>/<test_file_base>/log
+    local test_rel_path="${test_file#tests/}"
+    local test_dir=$(dirname "$test_rel_path")
+    local test_name=$(basename "$test_file" .py)
+    local isolated_log_dir="${run_path}/unittest_logs/${test_dir}/${test_name}/log"
+    mkdir -p "$isolated_log_dir"

-    # Run pytest with coverage for the current file
-    # Set timeout to 600 seconds to avoid infinite loop
-    timeout 600 python -m coverage run -m pytest -c ${PYTEST_INI} "$file" -vv -s
+    # Set FD_LOG_DIR to isolate logs for each test
+    export FD_LOG_DIR="$isolated_log_dir"
+
+    # Run test
+    timeout 600 python -m coverage run -m pytest -c ${PYTEST_INI} "$test_file" -vv -s
    status=$?
+
    if [ "$status" -ne 0 ]; then
-        echo "$file" >> "$failed_tests_file"
-        failed_pytest=$((failed_pytest+1))
-
+        echo "$test_file" >> "$log_prefix"
        echo ""
-        echo "==================== Dumping Logs ===================="
+        echo "==================== Test Failed: $test_file ===================="

-        for log_dir in "${run_path}"/log*; do
-            if [ -d "${log_dir}" ]; then
-                echo
-                echo ">>>> Processing log directory: ${log_dir}"
+        # Use isolated log directory for this test
+        if [ -d "$isolated_log_dir" ]; then
+            echo
+            echo ">>>> Processing log directory: ${isolated_log_dir}"

-                # print all workerlog.0
-                worker_logs=("${log_dir}"/workerlog.0)
-                if [ "${#worker_logs[@]}" -gt 0 ]; then
-                    for worker_log in "${worker_logs[@]}"; do
-                        if [ -f "${worker_log}" ]; then
-                            echo "---------------- ${worker_log} (last 100 lines) ----------------"
-                            tail -n 100 "${worker_log}" || true
-                            echo "---------------------------------------------------------------"
-                        fi
-                    done
-                else
-                    echo "No workerlog.0 found in ${log_dir}"
-                fi
+            # workerlog
+            worker_logs=("${isolated_log_dir}"/workerlog.0)

-                echo ">>> grep error in ${log_dir}"
-                grep -Rni --color=auto "error" "${log_dir}" || true
+            if [ -f "${worker_logs[0]}" ]; then
+                for worker_log in "${worker_logs[@]}"; do
+                    [ -f "${worker_log}" ] || continue
+                    echo "---------------- ${worker_log} (last 100 lines) ----------------"
+                    tail -n 100 "${worker_log}" || true
+                    echo "---------------------------------------------------------------"
+                done
            fi
-        done
+
+            echo ">>> grep error in ${isolated_log_dir}"
+            grep -Rni --color=auto "error" "${isolated_log_dir}" || true
+        fi

        # print all server logs
        server_logs=("${run_path}"/*.log)
@@ -92,28 +114,251 @@ for file in $TEST_FILES; do
            echo "No *.log files found"
        fi

-        echo "======================================================"
-    else
-        success_pytest=$((success_pytest+1))
+        echo "======================================================="
    fi
-    ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9
-    ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9
-done
-shopt -u nullglob

-##################################
-# Summary
-##################################
+    # Clean up port-related processes
+    if [ -n "$FD_CACHE_QUEUE_PORT" ]; then
+        ps -ef | grep "${FD_CACHE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true
+    fi
+    if [ -n "$FD_ENGINE_QUEUE_PORT" ]; then
+        ps -ef | grep "${FD_ENGINE_QUEUE_PORT}" | grep -v grep | awk '{print $2}' | xargs -r kill -9 || true
+    fi
+
+    # if passed, remove the isolated log directory and server logs
+    if [ "$status" -eq 0 ]; then
+        rm -rf "${isolated_log_dir}" || true
+        # Clean up server logs in run_path on pass
+        for f in "${run_path}"/*.log; do
+            [[ "$(basename "$f")" != "${failed_tests_file}" ]] && rm -f "$f" || true
+        done
+    fi
+
+    # Unset FD_LOG_DIR to avoid affecting next test
+    unset FD_LOG_DIR
+    return $status
+}
+
+# ============================================================
+# Run a shard of tests on a dedicated GPU
+#   - one shard = one process = one GPU
+#
+# Arguments:
+#   $1     shard name; used for the per-shard coverage data file and for
+#          the "<shard>_failed.txt" list written in the current directory
+#   $2     GPU id exported as CUDA_VISIBLE_DEVICES while the shard runs
+#   $3...  test files to run, in order
+# Returns:
+#   the number of failed test files, capped at 255
+# ============================================================
+run_shard() {
+    local shard_name=$1
+    local gpu_id=$2
+    shift 2
+    local tests=("$@")
+
+    # Failed log filename (no path, created in the current directory)
+    local failed_log="${shard_name}_failed.txt"
+    local success_count=0
+    local failed_count=0
+    local file
+
+    echo "===================================="
+    echo "Starting shard '${shard_name}' on GPU ${gpu_id}"
+    echo "Tests count: ${#tests[@]}"
+    echo "===================================="
+
+    # Pin the shard to its GPU and give it its own coverage data file
+    export CUDA_VISIBLE_DEVICES="$gpu_id"
+    export COVERAGE_FILE="${DIR}/../coveragedata/.coverage.${shard_name}"
+
+    # Start from an empty failure list (truncation also creates the file)
+    : > "$failed_log"
+
+    for file in "${tests[@]}"; do
+        echo "[${shard_name}] Running: $file"
+
+        if run_test_with_logging "$file" "$failed_log"; then
+            success_count=$((success_count + 1))
+        else
+            failed_count=$((failed_count + 1))
+        fi
+    done
+
+    unset COVERAGE_FILE
+
+    echo "===================================="
+    echo "Shard '${shard_name}' completed"
+    echo "Successful: $success_count"
+    echo "Failed: $failed_count"
+    echo "===================================="
+
+    unset CUDA_VISIBLE_DEVICES
+
+    # Exit statuses are taken modulo 256, so an uncapped count of 256
+    # failures would be reported as 0 (success).
+    if [ "$failed_count" -gt 255 ]; then
+        return 255
+    fi
+    return "$failed_count"
+}
+
+# ============================================================
+# Main Flow: collect tests, run multi-GPU tests sequentially, run single-GPU tests in two parallel shards, then summarize
+# ============================================================
+
+failed_tests_file="failed_tests.log"
+> "$failed_tests_file"
+
+echo "===================================="
+echo "Coverage Test Execution with Parallel Single-GPU Tests"
 echo "===================================="
-echo "Pytest total: $((failed_pytest + success_pytest))"
-echo "Pytest successful: $success_pytest"
-echo "Pytest failed: $failed_pytest"

+# ============================================================
+# Step 1: Collect & classify tests (paths of the form tests/.../test_*.py are extracted from the pytest --collect-only output, de-duplicated, then split by classify_tests)
+# ============================================================
+echo "Step 1: Collecting and classifying tests"

-if [ "$failed_pytest" -ne 0 ]; then
+ALL_TEST_FILES=$(
+    python -m pytest --collect-only -q -c "${PYTEST_INI}" "${tests_path}" --rootdir="${run_path}" --disable-warnings 2>&1 \
+    | grep -E 'tests/.+\/test_.*\.py' \
+    | sed -E 's@.*(tests/[^: ]*test_[^: ]*\.py).*@\1@' \
+    | sort -u
+)
+
+if [ -z "$ALL_TEST_FILES" ]; then
+    echo "ERROR: No test files found!"
+    exit 1
+fi
+
+MULTI_GPU_TESTS=()
+SINGLE_GPU_TESTS=()
+
+TOTAL_TESTS=0
+for file in $ALL_TEST_FILES; do
+    TOTAL_TESTS=$((TOTAL_TESTS + 1))
+    test_type=$(classify_tests "$file")
+
+    case "$test_type" in
+        "multi_gpu")
+            MULTI_GPU_TESTS+=("$file")
+            ;;
+        "single_gpu")
+            SINGLE_GPU_TESTS+=("$file")
+            ;;
+    esac
+done
+
+echo "Multi-GPU tests: ${#MULTI_GPU_TESTS[@]}"
+echo "Single-GPU tests: ${#SINGLE_GPU_TESTS[@]}"
+echo "Total tests: $TOTAL_TESTS"
+
+# ============================================================
+# Step 2: Run multi-GPU tests (sequential; each failing file is appended to $failed_tests_file, return codes are not checked here)
+# ============================================================
+echo "Step 2: Running multi-GPU tests"
+
+if [ ${#MULTI_GPU_TESTS[@]} -gt 0 ]; then
+    for file in "${MULTI_GPU_TESTS[@]}"; do
+        run_test_with_logging "$file" "$failed_tests_file"
+    done
+else
+    echo "No multi-GPU tests to run."
+fi
+
+# ============================================================
+# Step 3: Run single-GPU tests (parallel shards)
+# ============================================================
+echo "Step 3: Running single-GPU tests in parallel"
+
+if [ ${#SINGLE_GPU_TESTS[@]} -gt 0 ]; then
+    # Split single-GPU tests into 2 shards (1 per GPU): shard 1 gets the first TOTAL/2 tests (integer division), shard 2 gets the rest
+    TOTAL=${#SINGLE_GPU_TESTS[@]}
+    HALF=$(( TOTAL / 2 ))
+
+    SHARD_1=("${SINGLE_GPU_TESTS[@]:0:$HALF}")
+    SHARD_2=("${SINGLE_GPU_TESTS[@]:$HALF}")
+
+    echo "Shard 1: ${#SHARD_1[@]} tests on GPU 0"
+    echo "Shard 2: ${#SHARD_2[@]} tests on GPU 1"
+
+    # Run in parallel (1 background process per shard; GPU ids 0 and 1 are hard-coded)
+    run_shard "shard1" 0 "${SHARD_1[@]}" &
+    PID1=$!
+    run_shard "shard2" 1 "${SHARD_2[@]}" &
+    PID2=$!
+
+    # Wait for all shards to complete; the exit codes are only echoed below, failures are counted from the merged failed-test list
+    wait $PID1
+    EXIT_CODE1=$?
+    wait $PID2
+    EXIT_CODE2=$?
+
+    # Merge shard failed logs to main failed log, then delete the per-shard files
+    for shard in shard1 shard2; do
+        if [ -f "${shard}_failed.txt" ]; then
+            cat "${shard}_failed.txt" >> "$failed_tests_file"
+            rm -f "${shard}_failed.txt"
+        fi
+    done
+
+    echo ""
+    echo "===================================="
+    echo "Parallel execution completed"
+    echo "Shard 1 exit code: $EXIT_CODE1"
+    echo "Shard 2 exit code: $EXIT_CODE2"
+    echo "===================================="
+else
+    echo "No single-GPU tests to run."
+fi
+
+# ============================================================
+# Step 4: Summary (on failure: list failed files, prune and package unittest_logs, exit 8; otherwise exit 0)
+# ============================================================
+echo "Step 4: Summary"
+
+# Count failed tests (one failed test file per line in $failed_tests_file)
+if [ -f "$failed_tests_file" ]; then
+    failed_count=$(wc -l < "$failed_tests_file" | tr -d ' ')
+else
+    failed_count=0
+fi
+
+success_count=$((TOTAL_TESTS - failed_count))
+
+echo "Pytest total: $TOTAL_TESTS"
+echo "Pytest successful: $success_count"
+echo "Pytest failed: $failed_count"
+
+echo "===================================="
+
+# Exit with error and package logs if there were failures
+if [ "$failed_count" -ne 0 ]; then
    echo "Failed test cases are listed in $failed_tests_file"
    cat "$failed_tests_file"
+
+    # prune unittest_logs before packaging: drop console_error.log files, then empty directories
+    if [ -d "${run_path}/unittest_logs" ]; then
+        echo "Cleaning empty directories..."
+
+        # remove console_error.log files (cleanup logs from stopped processes)
+        find "${run_path}/unittest_logs" -name "console_error.log*" -delete || true
+
+        # repeat until a pass removes nothing (directory count unchanged); -mindepth 1 keeps the unittest_logs root itself
+        while true; do
+            before=$(find "${run_path}/unittest_logs" -type d | wc -l)
+            find "${run_path}/unittest_logs" -mindepth 1 -type d -empty -delete || true
+            after=$(find "${run_path}/unittest_logs" -type d | wc -l)
+            [ "$before" -eq "$after" ] && break
+        done
+    fi
+
+    # Only package logs when there are failures; the archive is written to ${run_path}/unittest_logs.tar.gz
+    echo "===================================="
+    echo "Step 5: Packaging logs (only on failure)"
+    echo "===================================="
+
+    if [ -d "${run_path}/unittest_logs" ]; then
+        tar -czf "${run_path}/unittest_logs.tar.gz" -C "${run_path}" unittest_logs
+        echo "Logs packaged to: ${run_path}/unittest_logs.tar.gz"
+        ls -lh "${run_path}/unittest_logs.tar.gz"
+    else
+        echo "No unittest_logs directory found."
+    fi
+
+    echo "===================================="
+
    exit 8
 fi

 echo "All tests passed!"
+exit 0
@@ -1389,7 +1389,8 @@ def test_streaming_chat_finish_reason(openai_client):

 def test_profile_reset_block_num():
    """测试profile reset_block_num功能，与baseline diff不能超过5%"""
-    log_file = "./log/config.log"
+    log_dir = os.getenv("FD_LOG_DIR", "log")
+    log_file = os.path.join(log_dir, "config.log")
    baseline = 31446

    if not os.path.exists(log_file):
@@ -734,7 +734,8 @@ def test_chat_with_response_max_tokens(openai_client):

 def test_profile_reset_block_num():
    """测试profile reset_block_num功能，与baseline diff不能超过5%"""
-    log_file = "./log/config.log"
+    log_dir = os.getenv("FD_LOG_DIR", "log")
+    log_file = os.path.join(log_dir, "config.log")
    baseline = 40000

    if not os.path.exists(log_file):
@@ -612,7 +612,8 @@ def test_streaming(openai_client, capsys):

 def test_profile_reset_block_num():
    """测试profile reset_block_num功能，与baseline diff不能超过5%"""
-    log_file = "./log/config.log"
+    log_dir = os.getenv("FD_LOG_DIR", "log")
+    log_file = os.path.join(log_dir, "config.log")
    baseline = 32562

    if not os.path.exists(log_file):
@@ -430,7 +430,8 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):

 def test_profile_reset_block_num():
    """测试profile reset_block_num功能，与baseline diff不能超过15%"""
-    log_file = "./log/config.log"
+    log_dir = os.getenv("FD_LOG_DIR", "log")
+    log_file = os.path.join(log_dir, "config.log")
    baseline = 30000

    if not os.path.exists(log_file):
@@ -81,10 +81,12 @@ def setup_and_run_server():
        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

+    base_log_dir = os.getenv("FD_LOG_DIR", "log")
+
    # router
    print("start router...")
    env_router = os.environ.copy()
-    env_router["FD_LOG_DIR"] = "log_router"
+    env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router")
    router_log_path = "router.log"

    router_cmd = [
@@ -110,7 +112,7 @@ def setup_and_run_server():
    env_prefill = os.environ.copy()
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
    env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
-    env_prefill["FD_LOG_DIR"] = "log_prefill"
+    env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill")
    prefill_log_path = "server_prefill.log"
    prefill_cmd = [
        sys.executable,
@@ -160,7 +162,7 @@ def setup_and_run_server():
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
    env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
-    env_decode["FD_LOG_DIR"] = "log_decode"
+    env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode")
    decode_log_path = "server_decode.log"
    decode_cmd = [
        sys.executable,
@@ -81,10 +81,12 @@ def setup_and_run_server():
        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

+    base_log_dir = os.getenv("FD_LOG_DIR", "log")
+
    # router
    print("start router...")
    env_router = os.environ.copy()
-    env_router["FD_LOG_DIR"] = "log_router"
+    env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router")
    router_log_path = "router.log"

    router_cmd = [
@@ -110,7 +112,7 @@ def setup_and_run_server():
    env_prefill = os.environ.copy()
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
    env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
-    env_prefill["FD_LOG_DIR"] = "log_prefill"
+    env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill")
    prefill_log_path = "prefill.log"
    prefill_cmd = [
        sys.executable,
@@ -160,7 +162,7 @@ def setup_and_run_server():
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
    env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
-    env_decode["FD_LOG_DIR"] = "log_decode"
+    env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode")
    decode_log_path = "decode.log"
    decode_cmd = [
        sys.executable,
@@ -84,6 +84,8 @@ def setup_and_run_server():
        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

+    base_log_dir = os.getenv("FD_LOG_DIR", "log")
+
    # get rdma nics
    current_dir = os.path.dirname(os.path.abspath(__file__))
    shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh")
@@ -94,7 +96,7 @@ def setup_and_run_server():
    # router
    print("start router...")
    env_router = os.environ.copy()
-    env_router["FD_LOG_DIR"] = "log_router"
+    env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router")
    router_log_path = "router.log"

    router_cmd = [
@@ -119,7 +121,7 @@ def setup_and_run_server():
    print("start prefill...")
    env_prefill = os.environ.copy()
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
-    env_prefill["FD_LOG_DIR"] = "log_prefill"
+    env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill")
    env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics

    prefill_log_path = "prefill.log"
@@ -166,7 +168,7 @@ def setup_and_run_server():
    print("start decode...")
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
-    env_decode["FD_LOG_DIR"] = "log_decode"
+    env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode")
    env_decode["KVCACHE_RDMA_NICS"] = rdma_nics

    decode_log_path = "decode.log"
@@ -86,6 +86,8 @@ def setup_and_run_server():
        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

+    base_log_dir = os.getenv("FD_LOG_DIR", "log")
+
    # get rdma nics
    current_dir = os.path.dirname(os.path.abspath(__file__))
    shell_path = os.path.join(current_dir, "utils/get_rdma_nics.sh")
@@ -96,7 +98,7 @@ def setup_and_run_server():
    # router
    print("start router...")
    env_router = os.environ.copy()
-    env_router["FD_LOG_DIR"] = "log_router"
+    env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router")
    router_log_path = "router.log"

    router_cmd = [
@@ -121,7 +123,7 @@ def setup_and_run_server():
    print("start prefill...")
    env_prefill = os.environ.copy()
    env_prefill["CUDA_VISIBLE_DEVICES"] = "0,1"
-    env_prefill["FD_LOG_DIR"] = "log_prefill"
+    env_prefill["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_prefill")
    env_prefill["KVCACHE_RDMA_NICS"] = rdma_nics

    prefill_log_path = "prefill.log"
@@ -170,7 +172,7 @@ def setup_and_run_server():
    print("start decode...")
    env_decode = os.environ.copy()
    env_decode["CUDA_VISIBLE_DEVICES"] = "1"
-    env_decode["FD_LOG_DIR"] = "log_decode"
+    env_decode["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_decode")
    env_decode["KVCACHE_RDMA_NICS"] = rdma_nics

    decode_log_path = "decode.log"
@@ -97,10 +97,12 @@ def setup_and_run_server():
        model_path = "baidu/ERNIE-4.5-0.3B-Paddle"
    print(f"model_path: {model_path}")

+    base_log_dir = os.getenv("FD_LOG_DIR", "log")
+
    # router
    print("start router...")
    env_router = os.environ.copy()
-    env_router["FD_LOG_DIR"] = "log_router"
+    env_router["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_router")
    router_log_path = "router.log"

    router_cmd = [
@@ -121,11 +123,11 @@ def setup_and_run_server():
        )

    # server0
-    print("start server0...")
+    print("start server 0...")
    env_server_0 = os.environ.copy()
    env_server_0["CUDA_VISIBLE_DEVICES"] = "0"
    env_server_0["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
-    env_server_0["FD_LOG_DIR"] = "log_server_0"
+    env_server_0["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_server_0")
    env_server_0["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT)
    log_path = "server_0.log"
    cmd = [
@@ -171,7 +173,7 @@ def setup_and_run_server():
    env_server_1["CUDA_VISIBLE_DEVICES"] = "1"
    env_server_1["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
    env_server_1["INFERENCE_MSG_QUEUE_ID"] = str(FD_API_PORT + 1)
-    env_server_1["FD_LOG_DIR"] = "log_server_1"
+    env_server_1["FD_LOG_DIR"] = os.path.join(base_log_dir, "log_server_1")
    log_path = "server_1.log"
    cmd = [
        sys.executable,
@@ -99,7 +99,8 @@ class TestMultiApiServer(unittest.TestCase):
        # Verify environment variables are set correctly
        first_call_kwargs = mock_popen.call_args_list[0][1]
        self.assertIn("env", first_call_kwargs)
-        self.assertEqual(first_call_kwargs["env"]["FD_LOG_DIR"], "log/log_0")
+        log_dir = os.getenv("FD_LOG_DIR", "log")
+        self.assertEqual(first_call_kwargs["env"]["FD_LOG_DIR"], os.path.join(log_dir, "log_0"))

    @patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available")
    def test_check_param_success(self, mock_is_port_available):