diff --git a/together_runner/slurm-disagg/00_setup.sh b/together_runner/slurm-disagg/00_setup.sh new file mode 100755 index 000000000..c23bbbc2a --- /dev/null +++ b/together_runner/slurm-disagg/00_setup.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# One-time (per pod boot) setup: grab the 2-node allocation, apply ephemeral node fixes, +# import the image, prestage weights. Idempotent — safe to re-run (reuses a live +# allocation). Needs sudo for the enroot nvidia-hook patch. +# +# bash 00_setup.sh +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$HERE/config.env"; source "$HERE/disagg_lib.sh" +load_resolved # reuse a persisted live allocation if present +ensure_allocation || exit 1 # ALLOC_JOB + PREFILL_NODE/DECODE_NODE now set +STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL" + +# 1) Patch the enroot nvidia hook on BOTH nodes (skip the persistenced/fabricmanager +# sockets that can't be bind-mounted inside the nested pod). Ephemeral; redo per boot. +# NOTE: this sed-patch of the *system* hook is unavoidable on this stack — a clean +# user-level override is not possible here (verified on enroot 4.0.1): runtime.sh runs +# the system AND user hooks.d with no basename dedup, so a user copy can't replace the +# system 98-nvidia.sh; and pyxis ignores a per-job ENROOT_SYSCONF_PATH redirect. +trlog "patching enroot nvidia hook on $PREFILL_NODE,$DECODE_NODE ..." 
+# One task per node (-N2 --ntasks-per-node=1). The remote script is idempotent: it exits 0
+# early if the hook already contains "no-persistenced", copies the hook to $F.bak before
+# editing, re-greps to confirm the substitution matched, and restores the backup if not.
+$STEP -N2 --ntasks-per-node=1 -w "$PREFILL_NODE,$DECODE_NODE" bash -c '
+  set -e; H=$(hostname); F=/etc/enroot/hooks.d/98-nvidia.sh
+  # Report a missing hook explicitly instead of failing later inside grep/cp.
+  [ -f "$F" ] || { echo "[$H] ERROR: $F not found — cannot patch enroot nvidia hook"; exit 1; }
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook already patched"; exit 0; fi
+  sudo -n true 2>/dev/null || { echo "[$H] ERROR: sudo unavailable — cannot patch $F"; exit 1; }
+  sudo cp "$F" "$F.bak"
+  sudo sed -i "s|cli_args=(\"--no-cgroups\" |cli_args=(\"--no-cgroups\" \"--no-persistenced\" \"--no-fabricmanager\" |" "$F"
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook patched"; else
+    echo "[$H] ERROR: patch did not take (hook format changed?) — restoring backup"; sudo cp "$F.bak" "$F"; exit 1
+  fi'
+
+# 2) Import the SGLang image to the shared squashfs. enroot import needs a node-local
+#    NON-overlay fs for its temp (overlay / can't mknod the overlayfs whiteouts) — the
+#    step auto-detects one (honors $ENROOT_SCRATCH if set).
+#    The import is written to a temporary name next to $SQSH and renamed only on success,
+#    so the "-f $SQSH" re-run guard below can never accept a half-written image.
+if [[ -f "$SQSH" ]]; then
+  trlog "image already imported: $SQSH"
+else
+  trlog "importing $DOCKER_IMAGE -> $SQSH (auto-detect ext4 temp; multi-GB, ~minutes) ..."
+  mkdir -p "$ENROOT_DIR"
+  $STEP -N1 -w "$PREFILL_NODE" bash -c '
+    set -e
+    S=""
+    # Pick the first candidate directory whose filesystem is not overlay/tmpfs.
+    for c in "$ENROOT_SCRATCH" /scratch /raid /mnt/local /mnt/resource /var/tmp /tmp; do
+      [ -n "$c" ] || continue
+      [ -d "$c" ] || mkdir -p "$c" 2>/dev/null || continue
+      # "|| continue": under set -e a failing stat would otherwise abort the whole step
+      # instead of moving on to the next candidate.
+      t=$(stat -f -c %T "$c" 2>/dev/null) || continue
+      case "$t" in overlayfs|overlay|tmpfs|"") continue;; esac
+      mkdir -p "$c/enroot" 2>/dev/null || continue
+      S="$c/enroot"; break
+    done
+    [ -n "$S" ] || { echo "[$(hostname)] ERROR: no node-local non-overlay scratch (set ENROOT_SCRATCH)"; exit 1; }
+    export ENROOT_CACHE_PATH="$S/cache" ENROOT_TEMP_PATH="$S/tmp" TMPDIR="$S/tmp"
+    mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_TEMP_PATH"
+    echo "[$(hostname)] enroot temp on $S ($t)"
+    # Import under a temporary name; the EXIT trap removes it on any failure or interrupt,
+    # and after a successful rename the rm is a harmless no-op.
+    P="$SQSH.partial.$$"
+    trap "rm -f \"\$P\"" EXIT
+    enroot import -o "$P" "$DOCKER_IMAGE"
+    mv -f "$P" "$SQSH"'
+fi
+
+# 3) Prestage weights to the shared FS (zero-download launches).
+if [[ -f "$MODEL_DIR/config.json" ]]; then + trlog "weights present: $MODEL_DIR" +else + trlog "downloading $MODEL_HF_ID -> $MODEL_DIR ..." + mkdir -p "$MODEL_DIR" + $STEP -N1 -w "$DECODE_NODE" \ + --container-image="$SQSH" \ + --container-mounts="$HF_CACHE:/root/.cache/huggingface,$MODELS_ROOT:$MODELS_ROOT" \ + bash -c 'export HF_TOKEN=$(cat /root/.cache/huggingface/token 2>/dev/null) + hf download "$MODEL_HF_ID" --local-dir "$MODEL_DIR"' +fi +trlog "setup complete. (allocation $ALLOC_JOB held; next: bash 01_preflight.sh)" diff --git a/together_runner/slurm-disagg/01_preflight.sh b/together_runner/slurm-disagg/01_preflight.sh new file mode 100755 index 000000000..5ef2892ca --- /dev/null +++ b/together_runner/slurm-disagg/01_preflight.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +# Preflight: auto-detect + verify the RDMA/KV path BEFORE launching servers, so a +# new cluster fails in seconds with a clear reason instead of after a multi-minute +# server startup. Resolves IB_DEVICES (GPU<->NIC topology) and the +# WITH_NVIDIA_PEERMEM decision (peermem vs dmabuf), checks IB port state on both +# nodes, and verifies the /dev/infiniband bind-mount + dmabuf inside the container. +# Writes the resolved values to $LOG_DIR/disagg_detected.env (sourced by 10_launch.sh). +# +# bash 01_preflight.sh # detect + check, write detected.env +# PROBE_MOONCAKE=1 bash 01_preflight.sh # + heavy mooncake register_memory probe +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$HERE/config.env"; source "$HERE/disagg_lib.sh" +load_nodes # nodes/ALLOC_JOB only — re-derive IB/peermem from clean config each run +alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; } +DETECTED="$LOG_DIR/disagg_detected.env" +SR="srun --jobid=$ALLOC_JOB --overlap --export=ALL -t 5" +fail=0 + +# --- 1. IB ports ACTIVE/LinkUp on BOTH nodes (host-side, fast) --- +trlog "checking IB port state on $PREFILL_NODE,$DECODE_NODE ..." 
+$SR -N2 --ntasks-per-node=1 --nodelist="$PREFILL_NODE,$DECODE_NODE" bash -c ' + n=0; act=0 + for d in /sys/class/infiniband/*; do + [ -e "$d/ports/1/state" ] || continue; n=$((n+1)) + s=$(cat "$d/ports/1/state"); p=$(cat "$d/ports/1/phys_state") + [[ "$s" == *ACTIVE* && "$p" == *LinkUp* ]] && act=$((act+1)) + done + echo "[$(hostname)] IB HCAs: $act/$n ACTIVE+LinkUp" + [ "$act" -gt 0 ] || { echo "[$(hostname)] ERROR: no ACTIVE IB port"; exit 1; } +' || { trerr "IB port check failed"; fail=1; } + +# --- 2. Resolve IB_DEVICES (topology) + container RDMA check on prefill node --- +CHK_OUT="$LOG_DIR/.preflight_chk.out" +trlog "detecting GPU<->NIC topology + checking RDMA inside container on $PREFILL_NODE ..." +$SR -N1 --nodelist="$PREFILL_NODE" --gres=gpu:$GPUS_PER_NODE \ + --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \ + bash -c " + echo '### DETECT ###' + python3 /inferencex/together_runner/slurm-disagg/_detect_rdma.py + echo '### CHECK ###' + python3 /inferencex/together_runner/slurm-disagg/_check_container_rdma.py ${PROBE_MOONCAKE:+--mooncake} + " 2>&1 | tee "$CHK_OUT" || { trerr "container RDMA check failed (see above)"; fail=1; } + +# Pull machine-readable values out of the captured output. +DET_IB="$(grep -m1 '^IB_DEVICES=' "$CHK_OUT" | cut -d= -f2- || true)" +DET_LL="$(grep -m1 '^IB_LINK_LAYER=' "$CHK_OUT" | cut -d= -f2- || true)" +DMABUF="$(grep -m1 '^DMABUF_SUPPORTED=' "$CHK_OUT" | cut -d= -f2- || true)" + +# Honor an explicit override; else use the detected list. +IB_FINAL="${IB_DEVICES:-$DET_IB}" +[[ -n "$IB_FINAL" ]] || { trerr "could not resolve IB_DEVICES (set it explicitly in config.env)"; fail=1; } +[[ -n "${IB_DEVICES:-}" && -n "$DET_IB" && "$IB_DEVICES" != "$DET_IB" ]] && \ + trlog "NOTE: explicit IB_DEVICES ($IB_DEVICES) differs from detected ($DET_IB) — using explicit." + +# --- 3. 
peermem vs dmabuf decision (host-side: module presence + driver version) --- +PEERMEM_FINAL="${WITH_NVIDIA_PEERMEM:-}" +if [[ -z "$PEERMEM_FINAL" ]]; then + trlog "deciding KV mem-registration path (nvidia_peermem vs dmabuf) ..." + DEC="$($SR -N1 --nodelist="$PREFILL_NODE" bash -c ' + drv=$(cat /sys/module/nvidia/version 2>/dev/null); maj=${drv%%.*} + if modinfo nvidia_peermem >/dev/null 2>&1; then echo "peermem $drv"; else echo "dmabuf $drv $maj"; fi')" + read -r MODE DRV MAJ <<<"$DEC" + if [[ "$MODE" == "peermem" ]]; then + trlog "nvidia_peermem AVAILABLE (driver $DRV) — using default peermem path (WITH_NVIDIA_PEERMEM unset)." + else + if [[ "${MAJ:-0}" -ge 535 ]]; then + PEERMEM_FINAL=0 + trlog "nvidia_peermem ABSENT (driver $DRV ≥535) — forcing dmabuf (WITH_NVIDIA_PEERMEM=0)." + [[ "$DMABUF" == "1" ]] || { trerr "dmabuf chosen but libibverbs lacks ibv_reg_dmabuf_mr — KV transfer will fail"; fail=1; } + else + trerr "nvidia_peermem ABSENT and driver $DRV <535 (no dmabuf) — KV transfer cannot register GPU mem"; fail=1 + fi + fi +else + trlog "WITH_NVIDIA_PEERMEM explicitly set to '$PEERMEM_FINAL' — honoring it." +fi + +# --- 4. write the resolved truth source --- +if [[ "$fail" == "0" ]]; then + mkdir -p "$LOG_DIR" + cat > "$DETECTED" <}" + trlog "next: bash 10_launch.sh" +else + trerr "PREFLIGHT FAILED — fix the above before launching (detected.env NOT written)."; exit 1 +fi diff --git a/together_runner/slurm-disagg/10_launch.sh b/together_runner/slurm-disagg/10_launch.sh new file mode 100755 index 000000000..9d8d59c27 --- /dev/null +++ b/together_runner/slurm-disagg/10_launch.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +# Launch the 1P1D disaggregated endpoint as overlap steps INTO the 2-node allocation: +# prefill@PREFILL_NODE + decode@DECODE_NODE + sgl-router@PREFILL_NODE. Waits for health +# and writes a state file (endpoint) for benchmark/teardown. 
+# +# bash 10_launch.sh +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$HERE/config.env"; source "$HERE/disagg_lib.sh" +load_resolved +alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; } +STATE="$LOG_DIR/disagg_state.env" + +# Resolved RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM) comes from preflight. +DETECTED="$LOG_DIR/disagg_detected.env" +[[ -f "$DETECTED" ]] || { trlog "no $DETECTED — running 01_preflight.sh first ..."; bash "$HERE/01_preflight.sh"; } +source "$DETECTED" +[[ -n "${IB_DEVICES:-}" ]] || { trerr "IB_DEVICES unresolved after preflight"; exit 1; } + +MOUNTS="$(container_mounts)" +CG="$(cuda_graph_arg)" +# peermem path: empty => default (peermem); set => prefix WITH_NVIDIA_PEERMEM= (dmabuf when 0). +PEERMEM_PREFIX=""; [[ -n "${WITH_NVIDIA_PEERMEM:-}" ]] && PEERMEM_PREFIX="WITH_NVIDIA_PEERMEM=$WITH_NVIDIA_PEERMEM " +STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL" +trlog "RDMA: IB_DEVICES=$IB_DEVICES peermem=${WITH_NVIDIA_PEERMEM:-} alloc=$ALLOC_JOB" + +# --- prefill (with KV bootstrap server) --- +trlog "launching prefill on $PREFILL_NODE (TP$TP, cuda_graph=$([[ -z $CG ]] && echo on || echo off)) ..." +: > "$LOG_DIR/prefill.log" +nohup $STEP -N1 -w "$PREFILL_NODE" --gres=gpu:$GPUS_PER_NODE \ + --container-image="$SQSH" --container-mounts="$MOUNTS" \ + bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \ + --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \ + --host 0.0.0.0 --port $PORT --trust-remote-code \ + --disaggregation-mode prefill --disaggregation-bootstrap-port $BOOTSTRAP_PORT \ + --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/prefill.log 2>&1" >/dev/null 2>&1 & + +# --- decode --- +trlog "launching decode on $DECODE_NODE (TP$TP) ..." 
+: > "$LOG_DIR/decode.log" +nohup $STEP -N1 -w "$DECODE_NODE" --gres=gpu:$GPUS_PER_NODE \ + --container-image="$SQSH" --container-mounts="$MOUNTS" \ + bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \ + --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \ + --host 0.0.0.0 --port $PORT --trust-remote-code \ + --disaggregation-mode decode \ + --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/decode.log 2>&1" >/dev/null 2>&1 & + +trlog "waiting for both servers to be healthy (cuda-graph capture can take several minutes) ..." +wait_health "http://$PREFILL_IP:$PORT/health" 1800 "$ALLOC_JOB" || { trerr "prefill never healthy; see $LOG_DIR/prefill.log"; exit 1; } +trlog "prefill healthy." +wait_health "http://$DECODE_IP:$PORT/health" 1800 "$ALLOC_JOB" || { trerr "decode never healthy; see $LOG_DIR/decode.log"; exit 1; } +trlog "decode healthy." + +# --- router: another overlap step on the prefill node. MUST mount the model dir so the +# tokenizer loads locally (else 404s to HF). --- +trlog "launching sgl-router on $PREFILL_NODE:$ROUTER_PORT ..." +: > "$LOG_DIR/router.log" +nohup $STEP -N1 -w "$PREFILL_NODE" \ + --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \ + bash -c "python3 -m sglang_router.launch_router --pd-disaggregation \ + --prefill http://$PREFILL_IP:$PORT $BOOTSTRAP_PORT --decode http://$DECODE_IP:$PORT \ + --host 0.0.0.0 --port $ROUTER_PORT --policy random > $LOG_DIR/router.log 2>&1" >/dev/null 2>&1 & + +wait_health "http://$PREFILL_IP:$ROUTER_PORT/health" 120 "$ALLOC_JOB" || { trerr "router never healthy; see $LOG_DIR/router.log"; exit 1; } + +cat > "$STATE" </dev/null || { trerr "endpoint $ENDPOINT not healthy"; exit 1; } +SEQTAG="$(( ISL/1024 ))k$(( OSL/1024 ))k" +trlog "sweep CONC_LIST='$CONC_LIST' at ISL=$ISL OSL=$OSL via $ENDPOINT" + +# Run the bench client inside the image (mount repo at /inferencex + model for tokenizer), +# as an overlap step on the prefill allocation. 
One srun does the whole sweep. +nohup srun --jobid="$ALLOC_JOB" --overlap --nodelist="$PREFILL_NODE" \ + --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \ + bash -c " + pip install -q datasets pandas >/dev/null 2>&1 || true + for C in $CONC_LIST; do + NP=\$(( C * $PROMPTS_PER_CONC )); [ \$NP -lt 160 ] && NP=160 + echo \"############ conc=\$C num_prompts=\$NP ############\" + python3 /inferencex/utils/bench_serving/benchmark_serving.py \ + --backend sglang --model $SERVED_NAME --tokenizer $MODEL_DIR \ + --base-url $ENDPOINT --endpoint /v1/completions \ + --dataset-name random --random-input-len $ISL --random-output-len $OSL \ + --max-concurrency \$C --num-prompts \$NP --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir $RESULTS_DIR \ + --result-filename ${SERVED_NAME}_${SEQTAG}_conc\${C}.bench.json 2>&1 \ + | grep -E 'Successful requests|Total Token throughput|Output token throughput|Median TTFT|P99 TTFT|Median TPOT|Median E2EL' + echo \"=== conc=\$C done ===\" + done + echo ALL_SWEEP_DONE" 2>&1 | tee "$LOG_DIR/sweep.log" + +# Summary table from the result JSONs. 
+trlog "==== SWEEP SUMMARY ($SEQTAG) ====" +python3 - "$RESULTS_DIR" "$SERVED_NAME" "$SEQTAG" $CONC_LIST <<'PY' +import json, sys, os +rdir, name, seqtag = sys.argv[1], sys.argv[2], sys.argv[3] +concs = sys.argv[4:] +hdr = f"{'conc':>5} {'ok':>11} {'total tok/s':>12} {'out tok/s':>10} {'mTPOT ms':>9} {'mTTFT ms':>9} {'p99TTFT ms':>11}" +print(hdr); print('-'*len(hdr)) +for C in concs: + f = os.path.join(rdir, f"{name}_{seqtag}_conc{C}.bench.json") + if not os.path.exists(f): print(f"{C:>5} (missing)"); continue + d = json.load(open(f)) + ok = f"{d.get('completed')}/{d.get('num_prompts','?')}" + print(f"{C:>5} {ok:>11} {d.get('total_token_throughput',0):>12.0f} {d.get('output_throughput',0):>10.0f} " + f"{d.get('median_tpot_ms',0):>9.1f} {d.get('median_ttft_ms',0):>9.0f} {d.get('p99_ttft_ms',0):>11.0f}") +PY +trlog "raw results: $RESULTS_DIR | sweep log: $LOG_DIR/sweep.log" diff --git a/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md b/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md new file mode 100644 index 000000000..3626c9aeb --- /dev/null +++ b/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md @@ -0,0 +1,77 @@ +# Benchmark record — Qwen3-32B prefill/decode disaggregation (2-node, SGLang) + +**Date:** 2026-06-29 · **Cluster:** slinky (Slurm-on-k8s) · **Operator:** Johnsonms + +## Purpose +ClusterMAX inference-disagg phase-0 readiness proof: confirm this tenant slice can +deploy a 2-node prefill/decode-disaggregated SGLang endpoint, transfer KV cross-node +over RDMA, and serve at a representative throughput. Bring-up model: **Qwen/Qwen3-32B** +(dense), **1P1D** (1 prefill + 1 decode + sgl-router). + +## Config +- **Topology:** prefill@slinky-0 (TP8), decode@slinky-1 (TP8), sgl-router@slinky-0:8002, `--policy random`. +- **Hardware:** 2× node, each 8× B200 + 14× mlx5 IB. 16 GPUs total in the serving path. 
+- **Image:** `lmsysorg/sglang:dev-cu13` (sglang `0.0.0.dev1+g909123ddb`, sglang-router 0.3.2, + mooncake `0.3.11.post1`), shared squashfs `/data/home/johnson/enroot/sglang-dev-cu13.sqsh`. +- **KV transfer:** mooncake over RDMA, **dmabuf path forced via `WITH_NVIDIA_PEERMEM=0`** + (nvidia_peermem absent; driver 580). IB devices (GPU-adjacent, GPU-order): + `mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_4,mlx5_5,mlx5_6,mlx5_7`. `/dev/infiniband` bind-mounted. +- **Serving args:** `--tp 8 --trust-remote-code`, **CUDA graph ON**, prefill + `--disaggregation-mode prefill --disaggregation-bootstrap-port 9000`, decode `--disaggregation-mode decode`. +- **Bench client:** InferenceX `utils/bench_serving/benchmark_serving.py`, `--backend sglang`, + `--endpoint /v1/completions`, `--dataset-name random`, ISL=1024 OSL=1024, + num_prompts=max(conc×8, 160) (the sweep enforces a 160-prompt floor, hence 160 requests at conc=16). + +## Results (1k/1k, CUDA graph) + +| conc | requests | total tok/s | output tok/s | req/s | med TPOT (ms) | med TTFT (ms) | p99 TTFT (ms) | med E2E (ms) | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| 16 | 160/160 | 2,087 | 957 | 1.10 | 13.9 | 1,591 | 4,564 | 15,560 | +| 64 | 512/512 | 4,591 | 2,110 | 2.37 | 23.9 | 3,616 | 18,651 | 28,010 | +| 128 | 1024/1024 | 4,512 | 2,080 | 2.37 | 34.1 | 6,629 | 39,334 | 41,623 | +| 256 | 2048/2048 | 4,898 | 2,243 | 2.59 | 43.1 | 26,913 | 80,696 | 66,562 | + +**0 failed requests across the entire sweep.** Raw per-point JSON: `/data/home/johnson/enroot/sweep/qwen3-32b_1k1k_conc{16,64,128,256}.bench.json`. + +### Post-refactor re-validation (2026-06-30, full sweep, allocation model) +Re-ran the full sweep after the portability refactor: single 2-node `salloc --no-shell` +allocation, prefill/decode/router/bench as overlap steps, `IB_DEVICES` + dmabuf decision +auto-detected (IB list matched the hand-derived `mlx5_9,10,11,12,4,5,6,7` exactly).
+**0 failed requests across all points.** + +| conc | requests | total tok/s | output tok/s | med TPOT (ms) | med TTFT (ms) | p99 TTFT (ms) | vs 06-29 | +|---:|---:|---:|---:|---:|---:|---:|---:| +| 16 | 160/160 | 2,156 | 993 | 13.7 | 1,405 | 5,954 | +3% | +| 64 | 512/512 | 4,614 | 2,127 | 23.0 | 3,933 | 20,987 | +0.5% | +| 128 | 1024/1024 | 3,298 | 1,526 | 33.1 | 8,844 | 36,889 | **−27% (see note)** | +| 256 | 2048/2048 | 4,892 | 2,229 | 42.2 | 27,071 | 79,414 | −0.1% | + +**conc=128 anomaly:** reproducibly ~3.3–3.4k (two runs: 3,437 then 3,298), i.e. *below* +conc=64 — physically inconsistent with a healthy curve, and below the 06-29 baseline of +4,512. NOT noise (reproduces) and NOT a code regression (serving args/nodes/IB are byte-for-byte +unchanged; conc 16/64/256 match baseline). Read as a **1P1D dynamics artifact** at this +concurrency: the single prefill instance interleaves badly with decode (prefill bursts starve +decode, TTFT jumps to ~8.8s), whereas conc=64 stays stable and conc=256 is fully decode-saturated. +A candidate to revisit when scaling prefill (NP1D) or tuning chunked-prefill scheduling. + +## Findings +1. **Functional:** cross-node KV transfer works under sustained load — the whole path + (deploy → RDMA-in-pod → bootstrap → KV transfer → router → serve) is solid, 0 errors. +2. **Peak throughput ≈ 4,900 tok/s total (~306 tok/s/GPU over 16 GPUs).** +3. **Saturates at conc ≥ 64** (~4,500–4,900 tok/s). Adding concurrency past 64 yields ~no + extra throughput but sharply worse TTFT (p99 4.6 s → 80.7 s @ conc 256). +4. **Bottleneck = the single prefill instance** (1P1D): decode TPOT stays healthy (14–43 ms) + while TTFT explodes as prefill queues. To raise throughput / cut TTFT tail → add prefill + instances (e.g. 2P1D / NP1D) and/or enable chunked-prefill tuning. 
+ +## Critical fixes that made this work (env-specific, ephemeral on pod restart) +- **`WITH_NVIDIA_PEERMEM=0`** on prefill+decode → mooncake dmabuf KV registration (else + 3072× `Failed to register memory` → `KVTransferError`). [found by codex] +- **CUDA graph ON** is mandatory for perf: eager mode gave **85 tok/s total / 366 ms TPOT / + 65% client-timeout failures** at conc 16 — a ~22× throughput / ~27× TPOT regression vs graphs on. +- enroot nvidia hook patched `--no-persistenced --no-fabricmanager`; `/dev/infiniband` mounted; + enroot import temp on ext4 `/scratch`. (See HANDOVER-disagg-kv-transfer.md / HANDBACK-codex-dmabuf.md.) + +## Provenance +Nodes slinky-0/1; sweep run as overlap step on prefill job 61 (decode job 62), router :8002. +Server logs: `/data/home/johnson/enroot/{prefill-cg,decode-cg,router-sweep}.log`. Sweep +driver log: `/data/home/johnson/enroot/sweep.log`. diff --git a/together_runner/slurm-disagg/CLAUDE.md b/together_runner/slurm-disagg/CLAUDE.md new file mode 100644 index 000000000..57848a31c --- /dev/null +++ b/together_runner/slurm-disagg/CLAUDE.md @@ -0,0 +1,60 @@ +# slurm-disagg — Claude Code memory (quick debug/ramp-up) + +2-node SGLang **prefill/decode-disaggregated** benchmark harness on Slurm + enroot/pyxis. +Productized ClusterMAX disagg phase-0 proof. **Portable**: meant to run unmodified on any +such cluster. Sibling of `../CLAUDE.md` (single-node together_runner) but separate code. + +## State (2026-06-30) +- Committed on branch `together-runner-slurm-disagg`, **PR #2** open vs main + (togethercomputer/InferenceX). Author `Johnsonms `, no Claude trailer. +- Validated on slinky (Qwen3-32B 1P1D, TP8): conc 16/64/256 match 06-29 baseline, 0 failures. +- **OPEN issue**: conc=128 reproducible throughput dip (~3.3k, below conc64). NOT the refactor + (CPU alloc identical old vs new — proven). See `INVESTIGATE-conc128.md`. Deferred. 
+ +## Architecture (how it runs) +ONE 2-node allocation, everything is an overlap step into it: +``` +00_setup.sh -> salloc -N2 --no-shell (ALLOC_JOB) ; patch enroot nvidia hook ; enroot import ; hf download +01_preflight -> auto-detect IB_DEVICES + peermem/dmabuf ; verify IB ports + bind-mount + dmabuf +10_launch.sh -> prefill@node0 + decode@node1 + router@node0 (all `srun --jobid=$ALLOC --overlap`) +20_benchmark -> bench client (overlap step on node0) -> result JSONs + summary +teardown.sh -> scancel $ALLOC_JOB ; rm nodes/state files +run_all.sh -> 00 -> 01 -> 10 -> 20 +``` +Resolved state lives in `$LOG_DIR` (= `$ENROOT_DIR` = `$HOME/enroot`): +- `disagg_nodes.env` — ALLOC_JOB, PREFILL/DECODE_NODE+IP, partition (written by 00_setup). +- `disagg_detected.env` — IB_DEVICES, WITH_NVIDIA_PEERMEM (written by 01_preflight). +- `disagg_state.env` — ENDPOINT (written by 10_launch). +- `load_nodes()` (nodes only, for preflight) vs `load_resolved()` (nodes + detected, for launch/bench). + +## What's AUTO vs PINNED +- AUTO: partition (first w/ ≥2 idle GPU nodes), nodes (Slurm-assigned), IB_DEVICES + (`nvidia-smi topo -m` + `/sys` link_layer, majority-fabric), WITH_NVIDIA_PEERMEM + (peermem present→default; absent+drv≥535→0/dmabuf), enroot temp (first non-overlay fs). +- PINNED on purpose: GPUS_PER_NODE=8, TP=8 (B200). Override anything via env / config.env. + +## Gotchas / hard-won facts (don't relearn these) +- **enroot nvidia-hook sed-patch is unavoidable** (needs sudo, ephemeral). enroot 4.0.1 runs + system+user hooks.d with NO basename dedup (user hook can't override system 98-nvidia.sh); + pyxis ignores per-job ENROOT_SYSCONF_PATH. Both verified. Patch is idempotent + self-checking. +- **KV transfer needs the dmabuf path** here (`WITH_NVIDIA_PEERMEM=0`) — `nvidia_peermem` absent + (driver 580, only gdrdrv+nvidia_fs). Without it: 3072× `Failed to register memory` → KVTransferError. +- **CUDA graph mandatory** (default ON) — eager ≈ 22× slower / 65% timeouts. 
+- **enroot import temp must be non-overlay fs** — pod `/` is overlayfs, can't mknod whiteouts. +- **Every step gets only 2 CPUs** on slinky (CR_CORE_MEMORY, no DefCpuPerGPU) — suspected + throughput cap; see INVESTIGATE-conc128.md item 1 (`--exclusive`/`--cpus-per-task`). +- Router MUST mount the model dir (else tokenizer 404s to HF); runs on the prefill node. + +## Debug entry points +- Server logs: `$HOME/enroot/{prefill,decode,router}.log`. Sweep: `$HOME/enroot/sweep.log`, + results `$HOME/enroot/sweep/*.bench.json`. +- Liveness while running: `tail -f decode.log` (look for `Decode batch ... gen throughput`), + `squeue --me` (ALLOC_JOB), endpoint `curl $ENDPOINT/health`. +- Re-detect RDMA only: `bash 01_preflight.sh` (rewrites detected.env from clean config). +- Re-run subset: `CONC_LIST="64" bash 20_benchmark.sh` (endpoint must be up). +- Cross-cluster: just `bash run_all.sh` — if it fails, preflight prints the exact reason. + +## Related docs +`README.md` (usage), `PORTABILITY-ANALYSIS.md` (root-cause table + decisions), +`BENCHMARK-RECORD-qwen3-32b-disagg.md` (numbers), `INVESTIGATE-conc128.md` (open perf question). +Testbed/node fixes: see repo memory `disagg-enroot-node-setup`. diff --git a/together_runner/slurm-disagg/INVESTIGATE-conc128.md b/together_runner/slurm-disagg/INVESTIGATE-conc128.md new file mode 100644 index 000000000..1f75cd519 --- /dev/null +++ b/together_runner/slurm-disagg/INVESTIGATE-conc128.md @@ -0,0 +1,95 @@ +# Investigation: conc=128 throughput dip (1P1D, Qwen3-32B, slinky) + +Status: **OPEN — deferred**. Captured 2026-06-30 so we can resume without re-deriving. +Owner: johnson. Related: `BENCHMARK-RECORD-qwen3-32b-disagg.md`, `PORTABILITY-ANALYSIS.md`. + +## The observation +1k/1k sweep, 1P1D, TP8 each. 
**0 failed requests at every point.** conc 16/64/256 match the +06-29 baseline; **conc=128 is reproducibly low**: + +| conc | 06-29 baseline | 06-30 run1 | 06-30 run2 | note | +|---:|---:|---:|---:|---| +| 16 | 2,087 | 1,874 | 2,156 | run1 low, run2 on-baseline | +| 64 | 4,591 | 4,614 | — | ✓ | +| 128 | 4,512 | 3,437 | 3,298 | **reproducibly ~3.3–3.4k, below conc=64** | +| 256 | 4,898 | 4,892 | — | ✓ | + +The dip is *physically odd*: conc=128 sits **below** conc=64, and TTFT jumps (med ~6.8–8.8 s, +p99 ~37 s). On a healthy curve 128 should sit between 64 and 256. + +## Is it our refactor? — NO (proven 2026-06-30) +Hypothesis was that the allocation/overlap-step model (this PR) changed CPU/resource isolation +vs the old separate-`srun`-job model, starving prefill at the conc=128 operating point. + +**Disproved by direct probe** — both models grant the SAME CPUs per node: + +``` +# NEW model (overlap step into 'salloc -N2 --gres=gpu:8' allocation): +srun --jobid=$ALLOC --overlap -N1 -w slinky-0 bash -c 'nproc; grep Cpus_allowed_list /proc/self/status' + -> nproc=2 Cpus_allowed_list=0-1 SLURM_CPUS_ON_NODE=2 + +# OLD model (standalone job, what 06-29 used): +srun -p slinky -N1 -w slinky-0 --gres=gpu:8 bash -c 'nproc; grep Cpus_allowed_list /proc/self/status' + -> nproc=2 Cpus_allowed_list=0-1 SLURM_CPUS_ON_NODE=2 +``` + +Both = 2 CPUs/node (cluster is `SelectTypeParameters=CR_CORE_MEMORY` with **no `DefCpuPerGPU`**, +so a gpu:8 request gets the minimal 1–2 cores). Serving args / nodes / IB list are byte-for-byte +identical between the runs. **So the refactor is not the cause of the 128 dip.** + +Most likely: **environmental / 1P1D dynamics variance** — at conc=128 the single prefill instance +interleaves badly with decode (prefill bursts starve decode), an unstable operating point; 06-29's +4,512 was a luckier sample. (conc=16 also swung 1,874↔2,156 between runs → this setup has real +run-to-run variance.) 
+ +## Separate finding worth chasing: both models are CPU-starved to 2 cores +Every server (prefill, decode, router, bench client) runs pinned to **2 CPUs**. CPU-bound work +(tokenization, scheduler, sampling, FastAPI/uvicorn, mooncake orchestration) is throttled. This +likely **caps throughput across ALL points**, and more CPU headroom may also **stabilize the 128 +point**. This is the highest-value lever and is independent of the refactor. + +## Plan to resume (ranked) +1. **Give the allocation real CPUs, re-sweep.** Easiest, highest upside. In `disagg_lib.sh` + `ensure_allocation`, add to the `salloc`: `--cpus-per-task` (or `--exclusive`, or + `--cpus-per-gpu=N`). Probe the node core count first (`sinfo -h -n NODE -o '%c'` showed + the *allocated* 2, not physical — check `scontrol show node NODE | grep CPUTot`; the box has + ~160). Try `--exclusive` (whole node) → re-run conc 128 (and full sweep). Expectation: all + points rise; if 128 also normalizes, the dip was CPU contention after all. +2. **A/B old vs new launch at conc=128**, CPU held constant, to fully close the refactor question + (should match — both 2 CPU). Old launch commands preserved below. +3. **Instrument the 128 point.** While conc=128 runs, watch: + - prefill log: `Prefill batch ... #queue-req`, input throughput — is prefill the bottleneck? + - decode log: `Decode batch ... #running-req, gen throughput` — is decode starved (running-req + oscillating / low) at 128 but steady at 256? + - `nvidia-smi dmon` / GPU util on both nodes; CPU util of the 2 allowed cores (likely pegged). +4. **Sweep around it**: conc 96 / 128 / 160 / 192 to map whether it's a single bad point or a + trough between 64 and 256. + +## Reproducing the ORIGINAL (06-29) conditions — so we don't lose the 4,512 baseline +The 06-29 run used the OLD launch (separate jobs, hardcoded nodes) + first 20_benchmark.
That code +was refactored in place (not in git), but the exact commands are preserved in +`~/enroot/HANDOVER-disagg-kv-transfer.md` + `HANDBACK-codex-dmabuf.md`, and reproduced here: + +```bash +IMG=$HOME/enroot/sglang-dev-cu13.sqsh +MODEL=$HOME/models/qwen3-32b +IB=mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_4,mlx5_5,mlx5_6,mlx5_7 +MOUNTS=$MODEL:$MODEL,$HOME/enroot:$HOME/enroot,$HOME/.cache/huggingface:/root/.cache/huggingface,/dev/infiniband:/dev/infiniband + +# PREFILL (slinky-0) — standalone job, CUDA graph ON (omit --disable-cuda-graph): +srun -p slinky -N1 -w slinky-0 --gres=gpu:8 --job-name=pd_prefill \ + --container-image=$IMG --container-mounts=$MOUNTS \ + bash -c 'WITH_NVIDIA_PEERMEM=0 python3 -m sglang.launch_server --model-path '$MODEL' \ + --served-model-name qwen3-32b --tp 8 --host 0.0.0.0 --port 30000 --trust-remote-code \ + --disaggregation-mode prefill --disaggregation-bootstrap-port 9000 --disaggregation-ib-device '$IB +# DECODE (slinky-1): same but --disaggregation-mode decode (no bootstrap port). +# ROUTER: srun --jobid=PREFILL_JOBID --overlap -w slinky-0 ... launch_router --pd-disaggregation \ +# --prefill http://PREFILL_IP:30000 9000 --decode http://DECODE_IP:30000 --port 8002 --policy random +# (PREFILL_JOBID = Slurm job id of the prefill job; PREFILL_IP / DECODE_IP = addresses of the two server nodes.) +``` +Note: 06-29 also ran on **2 CPUs/node** (same default), so the 4,512 should be reproducible under +the same conditions — the gap is variance, not a lost config. For a *deterministic* baseline, pin +CPUs via `--exclusive` (item 1) in BOTH old and new and compare. + +## Not a blocker +0 failures, correctness intact, peak (4,892) matches baseline. conc=128 is a perf-shape question, +documented for later — does not block the harness landing.
diff --git a/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md b/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md new file mode 100644 index 000000000..814c59a63 --- /dev/null +++ b/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md @@ -0,0 +1,82 @@ +# slurm-disagg — 坑梳理与可移植性重构设计 + +整个 2-node SGLang PD-disagg bring-up 过程中遇到的坑、根因、以及让方案在**新集群** +上仍然稳定的重构计划。bring-up 已在 slinky(Slurm-on-k8s, 2×8B200)用 Qwen3-32B +1P1D 跑通并出 benchmark(见 BENCHMARK-RECORD),本文是「跑通 → 可靠固化」的依据。 + +## 坑分类(关键维度:换新集群会不会爆) + +| # | 坑 | 类别 | 新集群风险 | +|---|---|---|---| +| 1 | enroot import temp 必须 ext4 | A 环境硬约束 | 低 | +| 2 | enroot nvidia hook sed 补丁 (`--no-persistenced --no-fabricmanager`) | C 可消除 hack | 中 | +| 3 | bind-mount `/dev/infiniband` | A 环境硬约束 | 低 | +| 4 | router `--jobid= --overlap` | A Slurm gang sched | 低 | +| 5 | router 必须挂 model dir | A 配置正确性 | 低 | +| 6 | `WITH_NVIDIA_PEERMEM=0`(强制 dmabuf) | B 需检测 | 高 | +| 7 | `IB_DEVICES` GPU↔NIC 邻接表硬编码 | B 需检测 | 最高 | +| 8 | CUDA graph 必须开 | D 性能默认 | 低 | + +## 根因 + +- **#1** pod 的 `/` 是 overlayfs,`enroot import` 的 `mknod` whiteout 在 overlay-on-overlay + 上被内核禁止 → `aufs2ovlfs ... Operation not permitted`。已用 `ENROOT_TEMP_PATH=/scratch`(ext4)解。正确解,非 hack。 +- **#2** `nvidia-container-cli` 想 bind-mount persistenced/fabricmanager 的 socket,pod 里没有 → 启动失败。当前用 `sudo sed -i` 改全局 hook(最脏:sudo + 改系统文件 + 重启重做)。 +- **#3** pyxis 不自动透传 IB 字符设备。bind-mount 即正确解。 +- **#4** prefill 用满 `gres=gpu:8`,Slurm 不在满载节点上调度新 job;router 不要 GPU,用 overlap step 进 prefill 的 alloc。 +- **#5** router 加载 tokenizer,没有本地 model dir 就 404 到 HF。 +- **#6** `nvidia_peermem` 模块缺失(驱动 580,仅 gdrdrv+nvidia_fs);mooncake 默认走 peermem `ibv_reg_mr` 注册 GPU 显存失败 → 3072× `Failed to register memory` → `KVTransferError`。`=0` 强制 dmabuf(`ibv_reg_dmabuf_mr`,驱动 ≥535 支持)。探针:256MiB CUDA tensor,无 env 返回 -202,`=0` 返回 0。 +- **#7** `mlx5_9,10,11,12,4,5,6,7` 是本机 GPU0..7→NIC 顺序,且避开 mlx5_0–3(存储 NIC)。物理拓扑,每集群/机型不同(GB200/GB300≠B200)。 +- **#8** eager 比 graph 慢 ~22×、65% 超时。默认 ON。 + +## 决策(已确认) + +1. 
**环境相关项(#6/#7)默认自动探测**:`IB_DEVICES`、`WITH_NVIDIA_PEERMEM` 默认留空 = + preflight 自动探测填充;手动 export 可覆盖。 +2. **#2 尝试消除 → 实验结论:当前栈上无法干净消除,退回加固 sed 补丁。** + - 用户级 hook 覆盖**不可行**:enroot 4.0.1 `runtime.sh:93` 对 `[系统, 用户]` 两个 + hooks.d **各跑一遍、不按文件名去重**,用户那份 `98-nvidia.sh` 只会追加执行,无法 + 替换系统那份(系统那份照样先跑、照样失败)。 + - `ENROOT_SYSCONF_PATH` 重定向**不可行**:实测 `srun --export=ALL,ENROOT_SYSCONF_PATH=<path>` + 启容器,哨兵 hook 未生效 → **pyxis 忽略该变量**。改 pyxis plugstack.conf 又需 root,与 sudo 同级,无收益。 + - 故保留 `sudo sed` patch,但做成:幂等 + sudo 不可用清晰报错 + patch 后 grep 校验,失败自动还原 .bak。 + +## 重构计划 + +1. **新增 `01_preflight.sh`** + 在 `disagg_lib.sh` 加探测函数: + - `detect_ib_devices`:`nvidia-smi topo -m` / `/sys/class/infiniband/<dev>/device` PCIe + affinity → 按 GPU 顺序生成 `IB_DEVICES`,过滤非 GPU NIC;与手动值比对告警。 + - `check_ib_ports`:每节点逐端口 `state==ACTIVE` & `phys_state==LinkUp`,记录 `link_layer` + (IB vs RoCE)、`rate`。 + - `probe_dmabuf`:容器内 256MiB CUDA tensor mooncake register_memory,带/不带 + `WITH_NVIDIA_PEERMEM=0` 各一次 → 自动判定路径;并查 `modinfo nvidia_peermem` + 驱动版本。 + - `check_ibv_in_container`:ctypes `ibv_get_device_list` 数 == 主机枚举数; + `libibverbs` 是否导出 `ibv_reg_dmabuf_mr`。 +2. **`config.env`**:`IB_DEVICES` / `WITH_NVIDIA_PEERMEM` 默认空 → 自动探测。 +3. **#2 hook**:试用户级 hook 覆盖;失败退回幂等 sed + 自检。 +4. **就绪判定**:`wait_health` 外补「cuda graph capture done」标志,避免半就绪。 +5. 
**收尾**:BENCHMARK-RECORD 收进本目录(README 已相对引用,文件实际在 `~/enroot/`);补 `.gitignore`。 + +## 第二轮:消除剩余 hardcode(换集群零改动) + +审计发现 IB/peermem 之外还有 5 处 cluster 特定项,处理如下: + +| 项 | 原来 | 现在 | +|---|---|---| +| 节点名 `slinky-0/1` | `--nodelist` 写死 | **单个 2-node allocation**(`salloc --no-shell`),prefill/decode/router/bench 全是 `--jobid=$ALLOC --overlap` step;节点由 Slurm 分配,持久化到 `disagg_nodes.env` | +| 分区 `slinky` | 写死 | 空=自动选「含 ≥2 idle GPU 节点」的分区(可 `PARTITION=` 覆盖) | +| `GPUS_PER_NODE`/`TP`=8 | 写死 | **保留写死**(B200 专用,刻意不探测) | +| enroot temp `/scratch` | 写死、假设 ext4 | 00_setup 在节点上自动探测首个可写的非 overlay/tmpfs 目录(`/scratch /raid /var/tmp ...`,可 `ENROOT_SCRATCH=` 覆盖) | +| 路径 `/data/home/johnson/...` | 写死 | 默认 `$HOME/...`;`ENROOT_DIR`/`MODELS_ROOT` 必须在跨节点共享盘(preflight 验证) | +| `_detect_rdma` 取 `ports/1` | 假设单端口 | (已知小限制)多端口 HCA 需扩展;当前取端口 1 | + +架构收益:teardown 只需 `scancel` 一个 allocation;所有 step 共享它;再不依赖任何节点名。 +`load_nodes`(仅节点)给 preflight 用,`load_resolved`(节点+IB/peermem)给 launch/bench 用—— +preflight 每次从干净 config 重新探测,不会把自己上轮产出的 detected.env 误当成用户显式配置。 + +端到端在 slinky 复测(allocation 模型):Slurm 自动分到 slinky-[0-1],自动探测 IB 列表与 +之前手写一致,KV 传输正常。 + +完成后端到端重跑出 benchmark,再提交(push fork,PR body→md,无 Claude trailer)。 + +相关记忆:[[disagg-enroot-node-setup]] · [[inferencex-together-runner]] diff --git a/together_runner/slurm-disagg/README.md b/together_runner/slurm-disagg/README.md new file mode 100644 index 000000000..2e7a30a1c --- /dev/null +++ b/together_runner/slurm-disagg/README.md @@ -0,0 +1,76 @@ +# slurm-disagg — 2-node SGLang prefill/decode-disaggregated benchmark + +Reproducible harness for the ClusterMAX inference-disagg phase-0 readiness proof: +deploy a 2-node prefill/decode-disaggregated SGLang endpoint on a Slurm + enroot/pyxis +cluster, transfer KV cross-node over RDMA, and benchmark. 
+ +**Portable by design — runs unmodified on a new cluster.** The harness grabs ONE 2-node +allocation (`salloc --no-shell`) and runs prefill/decode/router/bench as overlap steps +into it, so **Slurm picks the nodes** (no node names hardcoded). Partition, IB device +list, the peermem-vs-dmabuf KV path, and the enroot temp dir are all **auto-detected**; +only GPUs/node (8) and TP (8) are pinned, on purpose, for B200. + +Validated 2026-06-29 with **Qwen3-32B** (1P1D, TP8 each): peak ~4,900 tok/s total +(~306 tok/s/GPU), 0 failures. See `BENCHMARK-RECORD-qwen3-32b-disagg.md` (re-validated 2026-06-30 post-refactor). + +## Run it +```bash +cd together_runner/slurm-disagg +bash 00_setup.sh # one-time per pod boot: node fixes + image import + weights (idempotent) +bash 01_preflight.sh # auto-detect IB devices + peermem path, verify RDMA — seconds; writes detected.env +bash 10_launch.sh # prefill + decode + router; waits healthy; writes state file +bash 20_benchmark.sh # concurrency sweep -> result JSONs + summary table +bash teardown.sh # scancel the servers +# or: bash run_all.sh # 00 -> 01 -> 10 -> 20 in sequence +``` +All config in `config.env` (sourced everywhere; override by exporting first). +`IB_DEVICES` and `WITH_NVIDIA_PEERMEM` default to **empty = auto-detected by preflight** +— set them explicitly only to override. `PROBE_MOONCAKE=1 bash 01_preflight.sh` adds the +heavy mooncake `register_memory` GPU probe. + +## Layout +- `config.env` — paths, model, ports, sweep params. Partition/nodes/IB/peermem/enroot-temp + default to AUTO; GPUs/node + TP pinned to 8 (B200). `ENROOT_DIR`/`MODELS_ROOT` default to + `$HOME` and must be on a cross-node-shared FS (preflight verifies). +- `disagg_lib.sh` — allocation/node resolution (`ensure_allocation`, `resolve_partition`), + container-mounts, health-wait, cuda-graph helpers, `load_nodes`/`load_resolved`. 
+- `00_setup.sh` — grab/reuse the 2-node allocation (persists `disagg_nodes.env`), ephemeral + node fixes, `enroot import` (auto-detects a node-local non-overlay temp), `hf download`. Idempotent. +- `01_preflight.sh` — detect GPU↔NIC topology + peermem/dmabuf decision + IB-port/RDMA + checks on both nodes; writes `$LOG_DIR/disagg_detected.env` (the resolved RDMA truth source). +- `_detect_rdma.py` — `nvidia-smi topo -m` + `/sys` link_layer → GPU-ordered IB device list. +- `_check_container_rdma.py` — in-container ibv device count, dmabuf export, optional mooncake probe. +- `10_launch.sh` — sources detected.env; launch prefill/decode/router, wait healthy, write `disagg_state.env`. +- `20_benchmark.sh` — sweep via `utils/bench_serving/benchmark_serving.py`, summary table. +- `teardown.sh` — cancel jobs. `run_all.sh` — full pipeline. + +## Environment gotchas (this is Slurm-on-k8s; nodes are nested pods) +Preflight (`01_preflight.sh`) now **detects or verifies** most of these so a new cluster +fails in seconds with a clear reason instead of mid-launch: +1. **enroot import temp must be ext4** (`ENROOT_TEMP_PATH=/scratch/...`) — overlay `/` + can't `mknod` the overlayfs whiteouts → `aufs2ovlfs ... Operation not permitted`. +2. **enroot nvidia hook** patched with `--no-persistenced --no-fabricmanager` (00_setup.sh, + needs sudo) — else `nvidia-container-cli` can't bind-mount those sockets in the pod. + *This sed-patch is unavoidable on this stack:* enroot 4.0.1 runs system+user `hooks.d` + with no basename dedup (a user hook can't replace the system one), and pyxis ignores a + per-job `ENROOT_SYSCONF_PATH` redirect — both verified. The patch is idempotent + self-checking. +3. **RDMA-to-pod** = bind-mount `/dev/infiniband` (in `container_mounts`). Preflight verifies + libibverbs sees the same device count as `/sys/class/infiniband`. (Image lacks `ibv_devinfo`/`rdma` CLIs — cosmetic.) +4. 
**KV-mem registration path auto-decided** by preflight: `nvidia_peermem` present → default + path; absent + driver ≥535 → `WITH_NVIDIA_PEERMEM=0` (mooncake dmabuf). Without the right + choice: thousands of `Failed to register memory` → `KVTransferError` on routed requests. +5. **IB device list auto-detected** (`--disaggregation-ib-device`): GPU↔NIC PCIe affinity from + `nvidia-smi topo -m`, filtered to the majority fabric (IB drops the Ethernet storage NICs; + RoCE keeps Ethernet). Override via `IB_DEVICES=...`. +6. **CUDA graph ON** (default) is mandatory for perf — eager mode is ~22x slower with + 65% client timeouts. (`DISABLE_CUDA_GRAPH=1` only for debugging.) +7. **Router** runs as an `srun --jobid=<ALLOC_JOB> --overlap` step (Slurm won't co-schedule a + fresh job on a full node) and **mounts the model dir** (else tokenizer 404s to HF). + +Fixes 1–2 are ephemeral and reset on pod restart — re-run `00_setup.sh`. Detection (3–5) is +re-run each `01_preflight.sh`, so it self-adjusts on a new cluster/HW. + +## Known limitation (current config) +1P1D throughput saturates ~conc 64 (the single prefill instance is the bottleneck); TTFT +tail grows sharply with concurrency. For higher throughput / lower TTFT, scale prefill +instances (NP1D) — a future extension of `10_launch.sh`. diff --git a/together_runner/slurm-disagg/_check_container_rdma.py b/together_runner/slurm-disagg/_check_container_rdma.py new file mode 100644 index 000000000..779343cf9 --- /dev/null +++ b/together_runner/slurm-disagg/_check_container_rdma.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +"""In-container RDMA readiness checks for the disagg KV path. Run inside the +SGLang image (pyxis) on a GPU node. Verifies, in order: + + 1. /dev/infiniband is visible and libibverbs enumerates the HCAs (== the count + visible under /sys/class/infiniband) — i.e. the bind-mount worked. + 2. libibverbs exports ibv_reg_dmabuf_mr (the peermem-free GPU-mem path). + 3. 
(optional, --mooncake) the Mooncake transfer engine can register a CUDA + buffer with WITH_NVIDIA_PEERMEM=0 — the single decisive check for whether + cross-node KV transfer will work. Heavy (allocates GPU mem); opt-in. + +Exit 0 if all *required* checks pass. Prints a one-line report per check to +stderr; machine-readable KEY=VALUE to stdout. +""" +import ctypes, glob, os, sys + + +def err(*a): + print(*a, file=sys.stderr) + + +def check_ibv(): + sys_n = len(glob.glob("/sys/class/infiniband/*")) + try: + ib = ctypes.CDLL("libibverbs.so.1") + except OSError as e: + err(f"[rdma] FAIL: cannot load libibverbs.so.1: {e}") + return False, sys_n, 0, False + ib.ibv_get_device_list.restype = ctypes.POINTER(ctypes.c_void_p) + n = ctypes.c_int(0) + lst = ib.ibv_get_device_list(ctypes.byref(n)) + cnt = n.value + if lst: + ib.ibv_free_device_list(lst) + dmabuf = hasattr(ib, "ibv_reg_dmabuf_mr") + ok = cnt > 0 and cnt == sys_n + lvl = "OK" if ok else "FAIL" + err(f"[rdma] {lvl}: libibverbs sees {cnt} device(s); /sys shows {sys_n} " + f"(/dev/infiniband bind-mount {'works' if cnt>0 else 'MISSING'})") + err(f"[rdma] {'OK' if dmabuf else 'WARN'}: ibv_reg_dmabuf_mr " + f"{'exported' if dmabuf else 'ABSENT (dmabuf path unavailable)'}") + return ok, sys_n, cnt, dmabuf + + +def check_mooncake(): + """256 MiB CUDA tensor; Mooncake register_memory must return 0 with + WITH_NVIDIA_PEERMEM=0 (dmabuf). Returns True/False/None(unavailable).""" + os.environ.setdefault("WITH_NVIDIA_PEERMEM", "0") + try: + import torch + from mooncake.engine import TransferEngine # noqa + except Exception as e: + err(f"[rdma] SKIP mooncake probe: import failed ({e})") + return None + try: + eng = TransferEngine() + # hostname/device auto; minimal init varies by mooncake version — guard. 
+ buf = torch.empty(256 * 1024 * 1024 // 4, dtype=torch.float32, device="cuda") + ptr = buf.data_ptr() + rc = eng.register_memory(ptr, buf.numel() * 4) + if rc == 0: + eng.unregister_memory(ptr) + err(f"[rdma] {'OK' if rc==0 else 'FAIL'}: mooncake register_memory rc={rc} " + f"(WITH_NVIDIA_PEERMEM={os.environ['WITH_NVIDIA_PEERMEM']})") + return rc == 0 + except Exception as e: + err(f"[rdma] SKIP mooncake probe: engine init failed ({e})") + return None + + +def main(): + want_mooncake = "--mooncake" in sys.argv + ok, sys_n, cnt, dmabuf = check_ibv() + print(f"IBV_DEVICE_COUNT={cnt}") + print(f"SYS_IB_COUNT={sys_n}") + print(f"DMABUF_SUPPORTED={'1' if dmabuf else '0'}") + if want_mooncake: + mc = check_mooncake() + print(f"MOONCAKE_DMABUF_OK={'1' if mc else ('0' if mc is False else 'skip')}") + if mc is False: + ok = False + return 0 if ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/together_runner/slurm-disagg/_detect_rdma.py b/together_runner/slurm-disagg/_detect_rdma.py new file mode 100644 index 000000000..bd1e347f7 --- /dev/null +++ b/together_runner/slurm-disagg/_detect_rdma.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Auto-detect the GPU-adjacent RDMA NIC list for --disaggregation-ib-device. + +Portable across clusters/HW (B200/GB200/GB300, IB or RoCE): parses +`nvidia-smi topo -m` for each GPU's closest NIC (best PCIe relationship), maps +NIC -> mlx5 name via the topo legend, then keeps only NICs whose port is +ACTIVE/LinkUp and whose link_layer matches the *majority* fabric among the +GPU-adjacent NICs (so InfiniBand clusters drop the Ethernet storage NICs, and +RoCE clusters keep their Ethernet NICs). + +Emits to stdout: + IB_DEVICES= (the value for --disaggregation-ib-device) + IB_LINK_LAYER= +and a human-readable report to stderr. Exit non-zero if detection fails so the +caller can fall back to an explicit IB_DEVICES or abort. + +Run on the host node OR inside the container (needs nvidia-smi + /sys/class/infiniband). 
+""" +import os, re, subprocess, sys +from collections import Counter + +SYS_IB = "/sys/class/infiniband" +# PCIe relationship preference (closest first). PIX = same switch (ideal). +RANK = {"PIX": 0, "PXB": 1, "PHB": 2, "NODE": 3, "SYS": 4} + + +def err(*a): + print(*a, file=sys.stderr) + + +def port_attr(dev, name): + try: + with open(f"{SYS_IB}/{dev}/ports/1/{name}") as f: + return f.read().strip() + except OSError: + return "" + + +def topo(): + """Return (gpu_to_nics, nic_to_dev): per-GPU NIC indices ranked best-first, + and NIC index -> mlx5 device name (from the legend).""" + out = subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True) + if out.returncode != 0: + raise RuntimeError(f"nvidia-smi topo -m failed: {out.stderr.strip()}") + # nvidia-smi underlines the header with ANSI escapes — strip them so the + # GPU0/NIC0 labels are plain word tokens. + lines = re.sub(r"\x1b\[[0-9;]*m", "", out.stdout).splitlines() + + # NIC legend: " NIC3: mlx5_3" + nic_to_dev = {} + for ln in lines: + m = re.match(r"\s*NIC(\d+):\s*(\S+)", ln) + if m: + nic_to_dev[int(m.group(1))] = m.group(2) + + # Header row: locate the column index of each GPU and NIC label. + header = None + for ln in lines: + if re.search(r"\bGPU0\b", ln) and re.search(r"\bNIC0\b", ln): + header = ln + break + if header is None: + raise RuntimeError("could not find topo matrix header (GPU0..NIC0)") + cols = header.split() # ['GPU0','GPU1',...,'NIC0',...,'CPU','Affinity',...] + col_label = {} # position-in-cols -> label + for i, tok in enumerate(cols): + if re.fullmatch(r"GPU\d+", tok) or re.fullmatch(r"NIC\d+", tok): + col_label[i] = tok + + gpu_to_nics = {} + for ln in lines: + toks = ln.split() + if not toks or not re.fullmatch(r"GPU\d+", toks[0]): + continue + gpu = int(toks[0][3:]) + # The cells align to the header columns after the row label. Cells are + # the matrix entries (X / NV# / PIX / PHB / SYS ...). 
Re-split the data + # row the same way as header so column positions line up. + # toks[0] is the GPU label; remaining tokens are cells then CPU-affinity. + ranked = [] + for i, tok in enumerate(cols): + lbl = col_label.get(i) + if not lbl or not lbl.startswith("NIC"): + continue + # cell for this column = toks[i+1] (row label shifts everything by 1) + if i + 1 >= len(toks): + continue + cell = toks[i + 1] + if cell in RANK: + ranked.append((RANK[cell], int(lbl[3:]))) + ranked.sort() + gpu_to_nics[gpu] = [n for _, n in ranked] + return gpu_to_nics, nic_to_dev + + +def main(): + try: + gpu_to_nics, nic_to_dev = topo() + except Exception as e: + err(f"[detect_rdma] ERROR: {e}") + return 2 + if not gpu_to_nics: + err("[detect_rdma] ERROR: no GPUs found in topo matrix") + return 2 + + # Collect each GPU's best-rank NIC candidates (those tied at the top rank). + candidates = [] # (gpu, [dev,...]) usable RDMA devices at best PCIe rank + for gpu in sorted(gpu_to_nics): + nics = gpu_to_nics[gpu] + devs = [] + for n in nics: + dev = nic_to_dev.get(n) + if not dev or not os.path.isdir(f"{SYS_IB}/{dev}"): + continue + state = port_attr(dev, "state") # "4: ACTIVE" + phys = port_attr(dev, "phys_state") # "5: LinkUp" + ll = port_attr(dev, "link_layer") # InfiniBand | Ethernet + if "ACTIVE" not in state or "LinkUp" not in phys: + continue + devs.append((dev, ll)) + candidates.append((gpu, devs)) + + # Majority fabric among all GPU-adjacent usable NICs -> drops the off-fabric + # NICs (e.g. Ethernet storage NICs on an IB cluster). 
+ fabric = Counter(ll for _, devs in candidates for _, ll in devs) + if not fabric: + err("[detect_rdma] ERROR: no ACTIVE/LinkUp RDMA NIC adjacent to any GPU") + return 3 + majority_ll = fabric.most_common(1)[0][0] + + chosen = [] + for gpu, devs in candidates: + pick = next((d for d, ll in devs if ll == majority_ll), None) + if pick is None: + err(f"[detect_rdma] WARN: GPU{gpu} has no {majority_ll} NIC at best rank " + f"(candidates: {[d for d,_ in devs] or 'none'}) — skipped") + continue + chosen.append(pick) + + if len(chosen) != len(candidates): + err(f"[detect_rdma] WARN: matched {len(chosen)}/{len(candidates)} GPUs to a NIC") + if not chosen: + err("[detect_rdma] ERROR: no NICs chosen") + return 3 + + err(f"[detect_rdma] fabric={majority_ll} per-GPU NIC: " + + ", ".join(f"GPU{g}->{d}" for (g, _), d in zip(candidates, chosen))) + print(f"IB_DEVICES={','.join(chosen)}") + print(f"IB_LINK_LAYER={majority_ll}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/together_runner/slurm-disagg/config.env b/together_runner/slurm-disagg/config.env new file mode 100644 index 000000000..57cea4856 --- /dev/null +++ b/together_runner/slurm-disagg/config.env @@ -0,0 +1,70 @@ +# slurm-disagg — unified config for the 2-node SGLang prefill/decode-disaggregated +# benchmark on a Slurm + enroot/pyxis cluster. Sourced by every script; override any +# value by exporting it before sourcing. Cluster-specific bits (partition, nodes, IB +# devices, peermem path, enroot temp) default to AUTO and are resolved at runtime — +# the goal is to run unmodified on any such cluster. +# +# This file is sourced, not executed. Keep it side-effect free. + +# ---- Slurm allocation ---- +# The harness grabs ONE 2-node allocation (salloc --no-shell) and runs prefill/decode/ +# router/bench as overlap steps into it — so Slurm picks the nodes (no node names to +# hardcode). 00_setup.sh resolves these and persists job id + nodes to disagg_nodes.env. 
+# PARTITION empty -> first partition with >=2 idle GPU nodes (override to pin). +# PREFILL_NODE/DECODE_NODE empty -> assigned by the allocation; set BOTH to force +# salloc --nodelist=,. +export PARTITION="${PARTITION:-}" +export PREFILL_NODE="${PREFILL_NODE:-}" +export DECODE_NODE="${DECODE_NODE:-}" +# B200-specific: 8 GPUs/node, TP across the whole node. Hardcoded on purpose. +export GPUS_PER_NODE="${GPUS_PER_NODE:-8}" +export TP="${TP:-8}" +export ALLOC_NAME="${ALLOC_NAME:-pd_disagg}" +export ALLOC_TIME="${ALLOC_TIME:-04:00:00}" # allocation walltime (setup+startup+sweep) + +# ---- Shared paths: ENROOT_DIR/MODELS_ROOT must be on a CROSS-NODE-SHARED FS (preflight +# verifies). Default to $HOME (shared on most Slurm/NFS/weka homes); override if not. ---- +export ENROOT_DIR="${ENROOT_DIR:-$HOME/enroot}" +export SQSH="${SQSH:-$ENROOT_DIR/sglang-dev-cu13.sqsh}" # shared squashfs image +export DOCKER_IMAGE="${DOCKER_IMAGE:-docker://lmsysorg/sglang:dev-cu13}" +export MODELS_ROOT="${MODELS_ROOT:-$HOME/models}" +export MODEL_DIR="${MODEL_DIR:-$MODELS_ROOT/qwen3-32b}" +export MODEL_HF_ID="${MODEL_HF_ID:-Qwen/Qwen3-32B}" +export SERVED_NAME="${SERVED_NAME:-qwen3-32b}" +export HF_CACHE="${HF_CACHE:-$HOME/.cache/huggingface}" +export RESULTS_DIR="${RESULTS_DIR:-$ENROOT_DIR/sweep}" +export LOG_DIR="${LOG_DIR:-$ENROOT_DIR}" + +# ---- enroot import temp: MUST be a node-local NON-overlay fs (overlay / can't mknod the +# overlayfs whiteouts). EMPTY = auto-detected per node by 00_setup.sh (first writable +# ext4/xfs/btrfs among /scratch /raid /var/tmp /tmp ...). Set to pin a specific path. ---- +export ENROOT_SCRATCH="${ENROOT_SCRATCH:-}" + +# ---- Serving ---- +export PORT="${PORT:-30000}" # prefill/decode OpenAI port (each on its own node) +export BOOTSTRAP_PORT="${BOOTSTRAP_PORT:-9000}" # prefill KV bootstrap server +export ROUTER_PORT="${ROUTER_PORT:-8002}" # sgl-router (on prefill node) +# GPU-adjacent IB devices for --disaggregation-ib-device, in GPU order. 
+# EMPTY = auto-detect in 01_preflight.sh (nvidia-smi topo -m + /sys link_layer); +# set explicitly to override (e.g. "mlx5_9,mlx5_10,..."). Resolved value lands in +# $LOG_DIR/disagg_detected.env. On this box auto-detect yields mlx5_9,10,11,12,4,5,6,7. +export IB_DEVICES="${IB_DEVICES:-}" +# KV-cache GPU-mem registration path. EMPTY = auto-decide in 01_preflight.sh: +# nvidia_peermem present -> leave unset (peermem path); absent + driver>=535 -> +# set 0 (force Mooncake dmabuf path). Set explicitly to override. +export WITH_NVIDIA_PEERMEM="${WITH_NVIDIA_PEERMEM:-}" +# CUDA graph is MANDATORY for perf (eager = ~22x slower, 65% client timeouts). +export DISABLE_CUDA_GRAPH="${DISABLE_CUDA_GRAPH:-0}" + +# ---- Benchmark ---- +export ISL="${ISL:-1024}" +export OSL="${OSL:-1024}" +export CONC_LIST="${CONC_LIST:-16 64 128 256}" +export PROMPTS_PER_CONC="${PROMPTS_PER_CONC:-8}" # num_prompts = conc * this (min 160) +export REPO_ROOT="${REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}" + +# ---- Node/IP resolution lives in disagg_lib.sh (resolve_nodes) + persists to +# $LOG_DIR/disagg_nodes.env; scripts call load_resolved() after sourcing this file. +# PREFILL_IP/DECODE_IP are filled there once nodes are known. ---- +export PREFILL_IP="${PREFILL_IP:-}" +export DECODE_IP="${DECODE_IP:-}" diff --git a/together_runner/slurm-disagg/disagg_lib.sh b/together_runner/slurm-disagg/disagg_lib.sh new file mode 100755 index 000000000..f11f99575 --- /dev/null +++ b/together_runner/slurm-disagg/disagg_lib.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Shared helpers for slurm-disagg. Source AFTER config.env. + +trlog() { echo "[disagg $(date +%H:%M:%S)] $*"; } +trerr() { echo "[disagg $(date +%H:%M:%S)] ERROR: $*" >&2; } + +# Source persisted node/allocation resolution (ALLOC_JOB + nodes + IPs). 
# Always returns 0, so a missing file is not an error for `set -e` callers.
load_nodes() { [[ -f "$LOG_DIR/disagg_nodes.env" ]] && source "$LOG_DIR/disagg_nodes.env"; return 0; }

# load_nodes + the detected RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM). Used by
# launch/benchmark. NOT used by 01_preflight (which *produces* detected.env and must
# re-derive from clean config defaults each run). Always returns 0.
load_resolved() {
  load_nodes
  [[ -f "$LOG_DIR/disagg_detected.env" ]] && source "$LOG_DIR/disagg_detected.env"
  return 0
}

# Routable IP for a Slurm node: hostname resolution first, then Slurm's NodeAddr.
#   $1      - node name
#   stdout  - the address, or an empty line if neither source yields one
ip_of() {
  local ip
  ip="$(getent hosts "$1" 2>/dev/null | awk '{print $1; exit}')"
  if [[ -z "$ip" ]]; then
    ip="$(scontrol show node "$1" 2>/dev/null | grep -oE 'NodeAddr=[^ ]+' | cut -d= -f2)"
  fi
  echo "$ip"
}

# True if Slurm job $1 is currently allocated (pending/running/configuring).
# The state code must match as a WHOLE token: an unanchored 'R|PD|CF' also accepts
# dead/unusable states that merely contain an "R" (PR preempted, RQ/RF/RH requeued,
# RD, RS, RV ...), which would make callers try to reuse a gone allocation.
alloc_alive() {
  [[ -n "${1:-}" ]] || return 1
  squeue -h -j "$1" -o '%t' 2>/dev/null \
    | grep -qE '^[[:space:]]*(R|PD|CF)[[:space:]]*$'
}

# Pick PARTITION if unset: first partition (in sinfo listing order) with >=2
# fully-idle GPU nodes.
#   Globals: PARTITION (read; written when empty)
#   Returns: 0 if PARTITION is set on exit, 1 (after trerr) if none qualifies
resolve_partition() {
  [[ -n "${PARTITION:-}" ]] && return 0
  local -A cnt=()
  local -a order=()
  local node part gres st
  # GRES looks like gpu:8, gpu:b200:8 or gpu:b200:8(S:0-1). Type fields must not
  # swallow '(' or the match runs on into the socket suffix and the "count"
  # becomes the socket index (0) instead of the GPU count.
  local gres_re='(^|,)gpu(:[^:,()]+)*:([0-9]+)'
  while IFS='|' read -r node part gres st; do
    part="${part%\*}"                       # default partition is flagged with '*'
    [[ -n "$part" && "$st" == "idle" ]] || continue
    [[ "$gres" =~ $gres_re ]] || continue
    (( BASH_REMATCH[3] > 0 )) || continue
    # Remember first-seen order: associative-array key order is unspecified.
    [[ -n "${cnt[$part]:-}" ]] || order+=("$part")
    cnt[$part]=$(( ${cnt[$part]:-0} + 1 ))
  done < <(sinfo -h -N -o '%N|%P|%G|%t' 2>/dev/null)
  # ${arr[@]+...} keeps an empty array safe under `set -u` on older bash.
  for part in ${order[@]+"${order[@]}"}; do
    if [[ ${cnt[$part]} -ge 2 ]]; then PARTITION="$part"; break; fi
  done
  [[ -n "$PARTITION" ]] || { trerr "no partition with >=2 idle GPU nodes (set PARTITION)"; return 1; }
  trlog "auto-selected PARTITION=$PARTITION"
}

# Ensure a live 2-node allocation; set ALLOC_JOB/PREFILL_NODE/DECODE_NODE/IPs and persist
# to $LOG_DIR/disagg_nodes.env. Idempotent: reuses the persisted allocation if still alive.
+ensure_allocation() { + if alloc_alive "${ALLOC_JOB:-}"; then trlog "reusing allocation $ALLOC_JOB"; else + resolve_partition || return 1 + local nlflag="" + [[ -n "$PREFILL_NODE" && -n "$DECODE_NODE" ]] && nlflag="--nodelist=$PREFILL_NODE,$DECODE_NODE" + trlog "requesting 2-node allocation on '$PARTITION' (gpu:$GPUS_PER_NODE/node, t=$ALLOC_TIME) ..." + local out; out="$(salloc -N2 --gres=gpu:"$GPUS_PER_NODE" -p "$PARTITION" --no-shell \ + -J "$ALLOC_NAME" --time="$ALLOC_TIME" $nlflag 2>&1)" || { trerr "salloc failed: $out"; return 1; } + ALLOC_JOB="$(grep -oE 'job allocation [0-9]+' <<<"$out" | grep -oE '[0-9]+' | head -1)" + [[ -n "$ALLOC_JOB" ]] || { trerr "could not parse job id from: $out"; return 1; } + fi + # Discover the assigned nodes from the allocation (Slurm-ordered). + local nl; nl="$(squeue -h -j "$ALLOC_JOB" -o '%N' 2>/dev/null)" + mapfile -t NODES < <(scontrol show hostnames "$nl" 2>/dev/null) + [[ ${#NODES[@]} -ge 2 ]] || { trerr "allocation $ALLOC_JOB has <2 nodes ($nl)"; return 1; } + PREFILL_NODE="${PREFILL_NODE:-${NODES[0]}}" + DECODE_NODE="${DECODE_NODE:-${NODES[1]}}" + PREFILL_IP="$(ip_of "$PREFILL_NODE")"; DECODE_IP="$(ip_of "$DECODE_NODE")" + [[ -n "$PREFILL_IP" && -n "$DECODE_IP" ]] || { trerr "could not resolve node IPs ($PREFILL_NODE/$DECODE_NODE)"; return 1; } + + mkdir -p "$LOG_DIR" + cat > "$LOG_DIR/disagg_nodes.env" < [extra srun args] -- +# (kept as a helper; scripts mostly inline srun --jobid=$ALLOC_JOB --overlap for clarity.) + +# Common pyxis container-mounts. Pass extra mounts as $1 (comma-prefixed or empty). +# /dev/infiniband is REQUIRED for RDMA-to-pod (KV transfer). HF cache + model + logs. +container_mounts() { + local extra="${1:-}" + echo "${MODEL_DIR}:${MODEL_DIR},${LOG_DIR}:${LOG_DIR},${HF_CACHE}:/root/.cache/huggingface,/dev/infiniband:/dev/infiniband${extra:+,$extra}" +} + +# Poll an HTTP /health until 200, a process/job dies, or timeout. 
# wait_health <url> [<timeout_s>] [<watch_job>]
#   <url>        health endpoint polled with curl (4 s per attempt, every 5 s)
#   <timeout_s>  overall budget in seconds (default 1800)
#   <watch_job>  optional Slurm job id; give up early once squeue no longer lists it
# Returns 0 as soon as the endpoint answers successfully, 1 (after trerr) on
# timeout or when the watched job is gone.
wait_health() {
  local url="$1" timeout="${2:-1800}" watch_job="${3:-}" t0
  t0=$(date +%s)
  while :; do
    if curl -sf -m4 "$url" >/dev/null 2>&1; then return 0; fi
    # squeue exits 0 with EMPTY output for a job that already finished (until the
    # controller purges it), so test the output rather than the exit status.
    if [[ -n "$watch_job" && -z "$(squeue -h -j "$watch_job" -o '%i' 2>/dev/null)" ]]; then
      trerr "watched job $watch_job exited before $url became healthy"; return 1
    fi
    if (( $(date +%s) - t0 > timeout )); then
      trerr "timeout waiting for $url"; return 1
    fi
    # No loop counter here: a trailing `(( i++ ))` evaluates to 0 on the first
    # pass, returns status 1, and aborts any caller running under `set -e`.
    sleep 5
  done
}

# cuda-graph flag: by default ON (empty); DISABLE_CUDA_GRAPH=1 adds the flag.
# Prints the extra launch argument (or nothing); always returns 0.
cuda_graph_arg() {
  if [[ "${DISABLE_CUDA_GRAPH:-0}" == "1" ]]; then
    echo "--disable-cuda-graph"
  fi
  return 0
}
diff --git a/together_runner/slurm-disagg/run_all.sh b/together_runner/slurm-disagg/run_all.sh
new file mode 100755
index 000000000..7158b713b
--- /dev/null
+++ b/together_runner/slurm-disagg/run_all.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# One-shot end-to-end: setup (idempotent) -> preflight -> launch -> benchmark sweep.
# Any failing stage aborts the pipeline (set -e).
# Leaves the endpoint running; run teardown.sh when done.
#
#   bash run_all.sh
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
bash "$HERE/00_setup.sh"
bash "$HERE/01_preflight.sh"
bash "$HERE/10_launch.sh"
bash "$HERE/20_benchmark.sh"
echo
# config.env is sourced inside the command substitution only to resolve LOG_DIR.
echo "End-to-end done. Endpoint still up — see $(. "$HERE/config.env"; echo "$LOG_DIR/disagg_state.env")."
echo "Teardown: bash $HERE/teardown.sh"
diff --git a/together_runner/slurm-disagg/teardown.sh b/together_runner/slurm-disagg/teardown.sh
new file mode 100755
index 000000000..075451a1f
--- /dev/null
+++ b/together_runner/slurm-disagg/teardown.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Tear down the disagg endpoint: cancel the 2-node allocation (its prefill/decode/router
# steps die with it). Removes the node + state files so the next run re-allocates fresh.
# (image, weights, detected RDMA config preserved.)
#
#   bash teardown.sh
# No `-e` here: teardown is best-effort and must reach the cleanup at the end.
set -uo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
load_resolved

# Prefer the persisted job id; otherwise look up this user's job by allocation name.
JOB="${ALLOC_JOB:-}"
if [[ -z "$JOB" ]]; then
  JOB="$(squeue --me -h -n "$ALLOC_NAME" -o '%i' 2>/dev/null | head -1)"
fi
if [[ -n "$JOB" ]]; then
  trlog "scancel allocation $JOB"; scancel "$JOB" 2>/dev/null || true
else
  trlog "no allocation found to cancel."
fi
rm -f "$LOG_DIR/disagg_state.env" "$LOG_DIR/disagg_nodes.env"
trlog "torn down. (image, weights, detected.env preserved)"