Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions together_runner/slurm-disagg/00_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env bash
# 00_setup.sh — one-time (per pod boot) preparation for the 2-node disagg run.
#
# Grabs the 2-node allocation, applies the ephemeral node fixes, imports the
# container image and prestages the weights. Idempotent: safe to re-run, a live
# allocation is reused. Needs sudo for the enroot nvidia-hook patch.
#
# Usage:
#   bash 00_setup.sh
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$HERE/config.env"
source "$HERE/disagg_lib.sh"

# Reuse a persisted live allocation if one is present.
load_resolved

# On success ALLOC_JOB and PREFILL_NODE/DECODE_NODE are set.
if ! ensure_allocation; then
  exit 1
fi

# Shared srun prefix for steps run inside the held allocation. Kept as a plain
# string on purpose: the steps below expand it unquoted so it word-splits.
STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"

# 1) Patch the enroot nvidia hook on BOTH nodes so it skips the persistenced /
#    fabricmanager sockets, which can't be bind-mounted inside the nested pod.
#    The patch is ephemeral and must be redone per boot.
#    NOTE: sed-patching the *system* hook is unavoidable on this stack — a clean
#    user-level override is not possible here (verified on enroot 4.0.1):
#    runtime.sh runs the system AND user hooks.d with no basename dedup, so a user
#    copy can't replace the system 98-nvidia.sh; and pyxis ignores a per-job
#    ENROOT_SYSCONF_PATH redirect.
trlog "patching enroot nvidia hook on $PREFILL_NODE,$DECODE_NODE ..."
$STEP -N2 --ntasks-per-node=1 -w "$PREFILL_NODE,$DECODE_NODE" bash -c '
  set -e
  H=$(hostname)
  F=/etc/enroot/hooks.d/98-nvidia.sh

  # Already carrying the flag: nothing to do on this node.
  if grep -q "no-persistenced" "$F"; then
    echo "[$H] hook already patched"
    exit 0
  fi

  # The in-place edit needs non-interactive sudo.
  if ! sudo -n true 2>/dev/null; then
    echo "[$H] ERROR: sudo unavailable — cannot patch $F"
    exit 1
  fi

  # Keep a copy of the unpatched hook, then insert the two extra cli_args.
  sudo cp "$F" "$F.bak"
  sudo sed -i "s|cli_args=(\"--no-cgroups\" |cli_args=(\"--no-cgroups\" \"--no-persistenced\" \"--no-fabricmanager\" |" "$F"

  # Confirm the substitution matched; otherwise roll back to the copy.
  if ! grep -q "no-persistenced" "$F"; then
    echo "[$H] ERROR: patch did not take (hook format changed?) — restoring backup"
    sudo cp "$F.bak" "$F"
    exit 1
  fi
  echo "[$H] hook patched"'

# 2) Import the SGLang image to the shared squashfs. enroot import needs a node-local
#    NON-overlay fs for its temp (overlay / can't mknod the overlayfs whiteouts) — the
#    step auto-detects one (honors $ENROOT_SCRATCH if set).
#
#    The image is imported under a temporary name and renamed into place only after
#    enroot succeeds, so an interrupted import never leaves a partial file at $SQSH
#    that the "already imported" guard below would accept on the next run.
if [[ -f "$SQSH" ]]; then
  trlog "image already imported: $SQSH"
else
  trlog "importing $DOCKER_IMAGE -> $SQSH (auto-detect ext4 temp; multi-GB, ~minutes) ..."
  mkdir -p "$ENROOT_DIR"
  # Target path, image URI and scratch hint are handed over as positional
  # parameters so the remote script does not depend on config.env exporting them.
  $STEP -N1 -w "$PREFILL_NODE" bash -c '
    set -e
    sqsh=$1
    image=$2
    scratch_hint=$3

    # Pick the first candidate directory that exists (or can be created) and
    # does not live on an overlay or tmpfs filesystem.
    S=""
    t=""
    for c in "$scratch_hint" /scratch /raid /mnt/local /mnt/resource /var/tmp /tmp; do
      [ -n "$c" ] || continue
      [ -d "$c" ] || mkdir -p "$c" 2>/dev/null || continue
      # A failing stat must only skip this candidate, not trip set -e.
      t=$(stat -f -c %T "$c" 2>/dev/null) || t=""
      case "$t" in
        overlayfs | overlay | tmpfs | "") continue ;;
      esac
      mkdir -p "$c/enroot" 2>/dev/null || continue
      S="$c/enroot"
      break
    done
    [ -n "$S" ] || { echo "[$(hostname)] ERROR: no node-local non-overlay scratch (set ENROOT_SCRATCH)"; exit 1; }

    export ENROOT_CACHE_PATH="$S/cache" ENROOT_TEMP_PATH="$S/tmp" TMPDIR="$S/tmp"
    mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_TEMP_PATH"
    echo "[$(hostname)] enroot temp on $S ($t)"

    # Import next to the final path, then rename; the trap removes a leftover
    # partial file on any failure (it is a no-op after a successful rename).
    partial="${sqsh%.sqsh}.partial.$$.sqsh"
    cleanup() { rm -f -- "$partial"; }
    trap cleanup EXIT
    enroot import -o "$partial" "$image"
    mv -f -- "$partial" "$sqsh"
  ' _ "$SQSH" "$DOCKER_IMAGE" "${ENROOT_SCRATCH:-}"
fi

# 3) Prestage weights to the shared FS (zero-download launches).
#    Presence of config.json in $MODEL_DIR is used as the "already staged" marker.
if [[ -f "$MODEL_DIR/config.json" ]]; then
  trlog "weights present: $MODEL_DIR"
else
  trlog "downloading $MODEL_HF_ID -> $MODEL_DIR ..."
  mkdir -p "$MODEL_DIR"
  # Model id and target dir are passed as positional parameters so the script
  # inside the container does not depend on config.env exporting them.
  $STEP -N1 -w "$DECODE_NODE" \
    --container-image="$SQSH" \
    --container-mounts="$HF_CACHE:/root/.cache/huggingface,$MODELS_ROOT:$MODELS_ROOT" \
    bash -c '
      # Take the token from the mounted HF cache when a non-empty token file is
      # there; otherwise leave any inherited HF_TOKEN untouched instead of
      # overwriting it with an empty string.
      token_file=/root/.cache/huggingface/token
      if [ -r "$token_file" ] && [ -s "$token_file" ]; then
        HF_TOKEN=$(cat "$token_file" 2>/dev/null)
        export HF_TOKEN
      fi
      hf download "$1" --local-dir "$2"
    ' _ "$MODEL_HF_ID" "$MODEL_DIR"
fi
trlog "setup complete. (allocation $ALLOC_JOB held; next: bash 01_preflight.sh)"
95 changes: 95 additions & 0 deletions together_runner/slurm-disagg/01_preflight.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# 01_preflight.sh — auto-detect and verify the RDMA/KV path BEFORE any server is
# launched, so a new cluster fails in seconds with a clear reason instead of after
# a multi-minute server startup.
#
# What it does:
#   * checks IB port state on both nodes
#   * resolves IB_DEVICES (GPU<->NIC topology)
#   * decides WITH_NVIDIA_PEERMEM (peermem vs dmabuf)
#   * verifies the /dev/infiniband bind-mount + dmabuf inside the container
#   * writes the resolved values to $LOG_DIR/disagg_detected.env
#     (sourced by 10_launch.sh)
#
# Usage:
#   bash 01_preflight.sh                   # detect + check, write detected.env
#   PROBE_MOONCAKE=1 bash 01_preflight.sh  # + heavy mooncake register_memory probe
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$HERE/config.env"
source "$HERE/disagg_lib.sh"

# Nodes/ALLOC_JOB only — IB/peermem are re-derived from clean config each run.
load_nodes
if ! alloc_alive "${ALLOC_JOB:-}"; then
  trerr "no live allocation — run 00_setup.sh first"
  exit 1
fi

# Output file for the resolved values.
DETECTED="$LOG_DIR/disagg_detected.env"
# srun prefix for every probe step (expanded unquoted so it word-splits);
# -t 5 is passed to srun as the step time limit.
SR="srun --jobid=$ALLOC_JOB --overlap --export=ALL -t 5"
# Sticky failure flag: checks set it to 1 and the script keeps going so that
# all problems are reported in one run.
fail=0

# --- 1. IB ports ACTIVE/LinkUp on BOTH nodes (host-side, fast) ---
# One task per node walks /sys/class/infiniband and counts HCAs whose port 1 is
# ACTIVE and LinkUp; a node with none fails the step.
trlog "checking IB port state on $PREFILL_NODE,$DECODE_NODE ..."
if ! $SR -N2 --ntasks-per-node=1 --nodelist="$PREFILL_NODE,$DECODE_NODE" bash -c '
  n=0
  act=0
  for hca in /sys/class/infiniband/*; do
    # Skip entries without a port 1 (also covers an unmatched glob).
    if [ ! -e "$hca/ports/1/state" ]; then
      continue
    fi
    n=$((n + 1))
    logical=$(cat "$hca/ports/1/state")
    physical=$(cat "$hca/ports/1/phys_state")
    if [[ "$logical" == *ACTIVE* && "$physical" == *LinkUp* ]]; then
      act=$((act + 1))
    fi
  done
  echo "[$(hostname)] IB HCAs: $act/$n ACTIVE+LinkUp"
  if [ "$act" -le 0 ]; then
    echo "[$(hostname)] ERROR: no ACTIVE IB port"
    exit 1
  fi
'; then
  trerr "IB port check failed"
  fail=1
fi

# --- 2. Resolve IB_DEVICES (topology) + container RDMA check on prefill node ---
# Runs the two helper scripts inside the container and captures their combined
# output so the KEY=value lines they print can be parsed afterwards.
CHK_OUT="$LOG_DIR/.preflight_chk.out"
# tee below needs the directory to exist; without this a fresh $LOG_DIR would
# make the check "fail" for a reason unrelated to RDMA.
mkdir -p "$LOG_DIR"
trlog "detecting GPU<->NIC topology + checking RDMA inside container on $PREFILL_NODE ..."
# ${PROBE_MOONCAKE:+--mooncake} is expanded here, by the submitting shell.
if ! $SR -N1 --nodelist="$PREFILL_NODE" --gres="gpu:$GPUS_PER_NODE" \
  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
  bash -c "
    echo '### DETECT ###'
    python3 /inferencex/together_runner/slurm-disagg/_detect_rdma.py
    echo '### CHECK ###'
    python3 /inferencex/together_runner/slurm-disagg/_check_container_rdma.py ${PROBE_MOONCAKE:+--mooncake}
  " 2>&1 | tee "$CHK_OUT"; then
  trerr "container RDMA check failed (see above)"
  fail=1
fi

# Pull machine-readable values out of the captured output. Each lookup is
# best-effort: a missing key (or missing capture file) yields an empty string.
DET_IB="$(grep -m1 '^IB_DEVICES=' "$CHK_OUT" 2>/dev/null | cut -d= -f2- || true)"
DET_LL="$(grep -m1 '^IB_LINK_LAYER=' "$CHK_OUT" 2>/dev/null | cut -d= -f2- || true)"
DMABUF="$(grep -m1 '^DMABUF_SUPPORTED=' "$CHK_OUT" 2>/dev/null | cut -d= -f2- || true)"

# Honor an explicit override; else use the detected list.
IB_FINAL="${IB_DEVICES:-$DET_IB}"
if [[ -z "$IB_FINAL" ]]; then
  trerr "could not resolve IB_DEVICES (set it explicitly in config.env)"
  fail=1
fi
if [[ -n "${IB_DEVICES:-}" && -n "$DET_IB" && "$IB_DEVICES" != "$DET_IB" ]]; then
  trlog "NOTE: explicit IB_DEVICES ($IB_DEVICES) differs from detected ($DET_IB) — using explicit."
fi

# --- 3. peermem vs dmabuf decision (host-side: module presence + driver version) ---
# An explicit WITH_NVIDIA_PEERMEM wins. Otherwise the prefill node is probed:
#   * modinfo finds nvidia_peermem  -> leave WITH_NVIDIA_PEERMEM unset (default path)
#   * no module, driver major >=535 -> WITH_NVIDIA_PEERMEM=0 (dmabuf); needs DMABUF=1
#   * no module, older driver       -> preflight failure
PEERMEM_FINAL="${WITH_NVIDIA_PEERMEM:-}"
if [[ -n "$PEERMEM_FINAL" ]]; then
  trlog "WITH_NVIDIA_PEERMEM explicitly set to '$PEERMEM_FINAL' — honoring it."
else
  trlog "deciding KV mem-registration path (nvidia_peermem vs dmabuf) ..."
  MODE=""
  DRV=""
  MAJ=""
  # Run the probe inside an `if` so a failing srun is recorded in $fail (and
  # reported with the other checks) instead of aborting the script via set -e.
  if DEC="$($SR -N1 --nodelist="$PREFILL_NODE" bash -c '
      drv=$(cat /sys/module/nvidia/version 2>/dev/null); maj=${drv%%.*}
      if modinfo nvidia_peermem >/dev/null 2>&1; then echo "peermem $drv"; else echo "dmabuf $drv $maj"; fi')"; then
    # The probe prints one line: "<mode> <driver> [<major>]".
    read -r MODE DRV MAJ <<<"$DEC" || true
  fi

  case "$MODE" in
    peermem)
      trlog "nvidia_peermem AVAILABLE (driver $DRV) — using default peermem path (WITH_NVIDIA_PEERMEM unset)."
      ;;
    dmabuf)
      # Treat a missing or non-numeric major as 0 so the comparison below
      # cannot hit an arithmetic error.
      [[ "$MAJ" =~ ^[0-9]+$ ]] || MAJ=0
      if ((10#$MAJ >= 535)); then
        PEERMEM_FINAL=0
        trlog "nvidia_peermem ABSENT (driver $DRV ≥535) — forcing dmabuf (WITH_NVIDIA_PEERMEM=0)."
        if [[ "$DMABUF" != "1" ]]; then
          trerr "dmabuf chosen but libibverbs lacks ibv_reg_dmabuf_mr — KV transfer will fail"
          fail=1
        fi
      else
        trerr "nvidia_peermem ABSENT and driver $DRV <535 (no dmabuf) — KV transfer cannot register GPU mem"
        fail=1
      fi
      ;;
    *)
      trerr "could not probe nvidia_peermem/driver on $PREFILL_NODE (srun failed or unexpected output: '${DEC:-}')"
      fail=1
      ;;
  esac
fi

# --- 4. write the resolved truth source ---
# Any failed check above means nothing is written and the script exits 1.
if [[ "$fail" != "0" ]]; then
  trerr "PREFLIGHT FAILED — fix the above before launching (detected.env NOT written)."
  exit 1
fi

mkdir -p "$LOG_DIR"
# Unquoted delimiter: the values are expanded now and stored as plain
# KEY=value lines. An empty WITH_NVIDIA_PEERMEM means "unset / peermem".
cat > "$DETECTED" <<EOF
# Auto-resolved by 01_preflight.sh — sourced by 10_launch.sh. Re-run preflight to refresh.
IB_DEVICES=$IB_FINAL
IB_LINK_LAYER=${DET_LL:-unknown}
WITH_NVIDIA_PEERMEM=$PEERMEM_FINAL
EOF

# The capture file is only needed for diagnosing failures; drop it on success.
rm -f "$CHK_OUT"

trlog "PREFLIGHT OK → $DETECTED"
trlog " IB_DEVICES=$IB_FINAL"
trlog " WITH_NVIDIA_PEERMEM=${PEERMEM_FINAL:-<unset:peermem>}"
trlog "next: bash 10_launch.sh"
72 changes: 72 additions & 0 deletions together_runner/slurm-disagg/10_launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash
# 10_launch.sh — bring up the 1P1D disaggregated endpoint as overlap steps INTO the
# 2-node allocation:
#   prefill @ PREFILL_NODE, decode @ DECODE_NODE, sgl-router @ PREFILL_NODE.
# Waits for health and writes a state file (endpoint) for benchmark/teardown.
#
# Usage:
#   bash 10_launch.sh
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$HERE/config.env"
source "$HERE/disagg_lib.sh"

load_resolved
if ! alloc_alive "${ALLOC_JOB:-}"; then
  trerr "no live allocation — run 00_setup.sh first"
  exit 1
fi
STATE="$LOG_DIR/disagg_state.env"

# Resolved RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM) comes from preflight;
# run it on demand when its output file is not there yet.
DETECTED="$LOG_DIR/disagg_detected.env"
if [[ ! -f "$DETECTED" ]]; then
  trlog "no $DETECTED — running 01_preflight.sh first ..."
  bash "$HERE/01_preflight.sh"
fi
source "$DETECTED"
if [[ -z "${IB_DEVICES:-}" ]]; then
  trerr "IB_DEVICES unresolved after preflight"
  exit 1
fi

MOUNTS="$(container_mounts)"
CG="$(cuda_graph_arg)"

# peermem path: empty => default (peermem); set => prefix WITH_NVIDIA_PEERMEM=<val>
# (dmabuf when 0). The trailing space separates the prefix from the command.
PEERMEM_PREFIX=""
if [[ -n "${WITH_NVIDIA_PEERMEM:-}" ]]; then
  PEERMEM_PREFIX="WITH_NVIDIA_PEERMEM=$WITH_NVIDIA_PEERMEM "
fi

# Shared srun prefix, expanded unquoted by the launch steps so it word-splits.
STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"
trlog "RDMA: IB_DEVICES=$IB_DEVICES peermem=${WITH_NVIDIA_PEERMEM:-<unset>} alloc=$ALLOC_JOB"

# Start one sglang.launch_server step in the background.
# Globals:   STEP, GPUS_PER_NODE, SQSH, MOUNTS, PEERMEM_PREFIX, MODEL_DIR,
#            SERVED_NAME, TP, PORT, IB_DEVICES, CG, LOG_DIR (all read)
# Arguments: $1 - role, used as the log file stem ($LOG_DIR/<role>.log)
#            $2 - node to run on
#            $3 - disaggregation flags for this role
# The server's own output goes to the role log from inside the step; the
# srun/nohup output itself is discarded.
_start_sglang_server() {
  local role="$1" node="$2" disagg_flags="$3"
  # Truncate the log so stale output from an earlier launch is not mistaken
  # for this one.
  : > "$LOG_DIR/$role.log"
  nohup $STEP -N1 -w "$node" --gres=gpu:$GPUS_PER_NODE \
    --container-image="$SQSH" --container-mounts="$MOUNTS" \
    bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \
      --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \
      --host 0.0.0.0 --port $PORT --trust-remote-code \
      $disagg_flags \
      --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/$role.log 2>&1" >/dev/null 2>&1 &
}

# --- prefill (with KV bootstrap server) ---
# An empty cuda_graph_arg result is reported as cuda graphs "on".
if [[ -z $CG ]]; then
  cg_state=on
else
  cg_state=off
fi
trlog "launching prefill on $PREFILL_NODE (TP$TP, cuda_graph=$cg_state) ..."
_start_sglang_server prefill "$PREFILL_NODE" \
  "--disaggregation-mode prefill --disaggregation-bootstrap-port $BOOTSTRAP_PORT"

# --- decode ---
trlog "launching decode on $DECODE_NODE (TP$TP) ..."
_start_sglang_server decode "$DECODE_NODE" "--disaggregation-mode decode"

# Both servers were started in parallel; wait for them one after the other.
trlog "waiting for both servers to be healthy (cuda-graph capture can take several minutes) ..."
if ! wait_health "http://$PREFILL_IP:$PORT/health" 1800 "$ALLOC_JOB"; then
  trerr "prefill never healthy; see $LOG_DIR/prefill.log"
  exit 1
fi
trlog "prefill healthy."
if ! wait_health "http://$DECODE_IP:$PORT/health" 1800 "$ALLOC_JOB"; then
  trerr "decode never healthy; see $LOG_DIR/decode.log"
  exit 1
fi
trlog "decode healthy."

# --- router: another overlap step on the prefill node. MUST mount the model dir so
#     the tokenizer loads locally (else 404s to HF). ---
ROUTER_URL="http://$PREFILL_IP:$ROUTER_PORT"
trlog "launching sgl-router on $PREFILL_NODE:$ROUTER_PORT ..."
: > "$LOG_DIR/router.log"
# The router's own output goes to router.log from inside the step; the
# srun/nohup output itself is discarded.
nohup $STEP -N1 -w "$PREFILL_NODE" \
  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
  bash -c "python3 -m sglang_router.launch_router --pd-disaggregation \
    --prefill http://$PREFILL_IP:$PORT $BOOTSTRAP_PORT --decode http://$DECODE_IP:$PORT \
    --host 0.0.0.0 --port $ROUTER_PORT --policy random > $LOG_DIR/router.log 2>&1" >/dev/null 2>&1 &

if ! wait_health "$ROUTER_URL/health" 120 "$ALLOC_JOB"; then
  trerr "router never healthy; see $LOG_DIR/router.log"
  exit 1
fi

# Written only once the router is healthy; 20_benchmark.sh sources this file.
cat > "$STATE" <<EOF
ALLOC_JOB=$ALLOC_JOB
ENDPOINT=$ROUTER_URL
EOF
trlog "ENDPOINT READY: $ROUTER_URL (state: $STATE)"
trlog "next: bash 20_benchmark.sh | teardown: bash teardown.sh"
58 changes: 58 additions & 0 deletions together_runner/slurm-disagg/20_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# 20_benchmark.sh — concurrency sweep against the live disagg endpoint using the
# InferenceX unified client (utils/bench_serving/benchmark_serving.py). Reuses one
# warm router. Saves a result JSON per concurrency under RESULTS_DIR and prints a
# summary table.
#
# Usage:
#   bash 20_benchmark.sh
set -euo pipefail

HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$HERE/config.env"
source "$HERE/disagg_lib.sh"
load_resolved

# The state file is sourced for ALLOC_JOB and ENDPOINT.
STATE="$LOG_DIR/disagg_state.env"
if [[ ! -f "$STATE" ]]; then
  trerr "no state file ($STATE) — run 10_launch.sh first"
  exit 1
fi
source "$STATE"

if ! alloc_alive "${ALLOC_JOB:-}"; then
  trerr "allocation $ALLOC_JOB gone — relaunch"
  exit 1
fi
mkdir -p "$RESULTS_DIR"

# Quick liveness probe (4 s cap) before committing to the sweep.
if ! curl -sf -m4 "$ENDPOINT/health" >/dev/null; then
  trerr "endpoint $ENDPOINT not healthy"
  exit 1
fi

# Tag used in result file names, e.g. ISL=1024 OSL=8192 -> "1k8k"
# (integer division by 1024).
isl_k=$((ISL / 1024))
osl_k=$((OSL / 1024))
SEQTAG="${isl_k}k${osl_k}k"
trlog "sweep CONC_LIST='$CONC_LIST' at ISL=$ISL OSL=$OSL via $ENDPOINT"

# Run the bench client inside the image (mount repo at /inferencex + model for tokenizer),
# as an overlap step on the prefill allocation. One srun does the whole sweep.
#
# -N1 pins the step to the single listed node, matching every other step in these
# scripts. --nodelist on its own only requires that the named host is part of the
# step, so without -N1 the step may span both nodes of the allocation and run the
# sweep once per node against the same endpoint and result files.
#
# Inside the double-quoted script, unescaped $VARS are expanded here by the
# submitting shell, while \$C / \$NP are evaluated in the container. Each run's
# output is filtered down to the headline metric lines; num_prompts is
# C * PROMPTS_PER_CONC with a floor of 160.
nohup srun --jobid="$ALLOC_JOB" --overlap --export=ALL -N1 --nodelist="$PREFILL_NODE" \
  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
  bash -c "
    pip install -q datasets pandas >/dev/null 2>&1 || true
    for C in $CONC_LIST; do
      NP=\$(( C * $PROMPTS_PER_CONC )); [ \$NP -lt 160 ] && NP=160
      echo \"############ conc=\$C num_prompts=\$NP ############\"
      python3 /inferencex/utils/bench_serving/benchmark_serving.py \
        --backend sglang --model $SERVED_NAME --tokenizer $MODEL_DIR \
        --base-url $ENDPOINT --endpoint /v1/completions \
        --dataset-name random --random-input-len $ISL --random-output-len $OSL \
        --max-concurrency \$C --num-prompts \$NP --percentile-metrics ttft,tpot,itl,e2el \
        --save-result --result-dir $RESULTS_DIR \
        --result-filename ${SERVED_NAME}_${SEQTAG}_conc\${C}.bench.json 2>&1 \
        | grep -E 'Successful requests|Total Token throughput|Output token throughput|Median TTFT|P99 TTFT|Median TPOT|Median E2EL'
      echo \"=== conc=\$C done ===\"
    done
    echo ALL_SWEEP_DONE" 2>&1 | tee "$LOG_DIR/sweep.log"

# Summary table from the result JSONs.
# $CONC_LIST is left unquoted on purpose so each concurrency becomes its own
# argument. The Python below prints one row per concurrency; a missing or
# unreadable result file gets a placeholder row instead of aborting the table.
trlog "==== SWEEP SUMMARY ($SEQTAG) ===="
python3 - "$RESULTS_DIR" "$SERVED_NAME" "$SEQTAG" $CONC_LIST <<'PY'
import json, sys, os

rdir, name, seqtag = sys.argv[1], sys.argv[2], sys.argv[3]
concs = sys.argv[4:]


def num(d, key):
    """Return d[key] if it is a number, else 0 (covers missing and null)."""
    v = d.get(key)
    return v if isinstance(v, (int, float)) else 0


hdr = f"{'conc':>5} {'ok':>11} {'total tok/s':>12} {'out tok/s':>10} {'mTPOT ms':>9} {'mTTFT ms':>9} {'p99TTFT ms':>11}"
print(hdr); print('-'*len(hdr))
for C in concs:
    f = os.path.join(rdir, f"{name}_{seqtag}_conc{C}.bench.json")
    if not os.path.exists(f):
        print(f"{C:>5} (missing)")
        continue
    # A truncated/corrupt file (e.g. from an interrupted run) must not take
    # down the rest of the table; json.JSONDecodeError is a ValueError.
    try:
        with open(f) as fh:
            d = json.load(fh)
    except (OSError, ValueError) as e:
        print(f"{C:>5} (unreadable: {e})")
        continue
    ok = f"{d.get('completed')}/{d.get('num_prompts','?')}"
    print(f"{C:>5} {ok:>11} {num(d, 'total_token_throughput'):>12.0f} {num(d, 'output_throughput'):>10.0f} "
          f"{num(d, 'median_tpot_ms'):>9.1f} {num(d, 'median_ttft_ms'):>9.0f} {num(d, 'p99_ttft_ms'):>11.0f}")
PY
trlog "raw results: $RESULTS_DIR | sweep log: $LOG_DIR/sweep.log"
Loading