togethercomputer · Johnsonms · Jun 30, 2026 · Jun 30, 2026
diff --git a/together_runner/slurm-disagg/00_setup.sh b/together_runner/slurm-disagg/00_setup.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# One-time (per pod boot) setup: grab the 2-node allocation, apply ephemeral node fixes,
+# import the image, prestage weights. Idempotent — safe to re-run (reuses a live
+# allocation; each later step skips work it detects as already done). Needs non-interactive
+# sudo on both nodes for the enroot nvidia-hook patch.
+#   bash 00_setup.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # absolute directory of this script
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_resolved                       # reuse a persisted live allocation if present
+ensure_allocation || exit 1         # ALLOC_JOB + PREFILL_NODE/DECODE_NODE now set
+STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"   # expanded unquoted below; relies on word splitting
+
+# 1) Patch the enroot nvidia hook on BOTH nodes: sed adds --no-persistenced/--no-fabricmanager
+#    to its cli_args (skip the persistenced/fabricmanager sockets that can't be bind-mounted
+#    inside the nested pod). Skipped when the hook already contains "no-persistenced"; a .bak
+#    copy is taken first and restored if the sed does not take. Ephemeral; redo per boot.
+#    NOTE: this sed-patch of the *system* hook is unavoidable on this stack — a clean user-level override is not possible here (verified on enroot 4.0.1):
+#    runtime.sh runs the system AND user hooks.d with no basename dedup, so a user copy can't replace the system 98-nvidia.sh; and pyxis ignores a per-job ENROOT_SYSCONF_PATH redirect.
+trlog "patching enroot nvidia hook on $PREFILL_NODE,$DECODE_NODE ..."
+$STEP -N2 --ntasks-per-node=1 -w "$PREFILL_NODE,$DECODE_NODE" bash -c '
+  set -e; H=$(hostname); F=/etc/enroot/hooks.d/98-nvidia.sh
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook already patched"; exit 0; fi
+  sudo -n true 2>/dev/null || { echo "[$H] ERROR: sudo unavailable — cannot patch $F"; exit 1; }
+  sudo cp "$F" "$F.bak"
+  sudo sed -i "s|cli_args=(\"--no-cgroups\" |cli_args=(\"--no-cgroups\" \"--no-persistenced\" \"--no-fabricmanager\" |" "$F"
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook patched"; else
+    echo "[$H] ERROR: patch did not take (hook format changed?) — restoring backup"; sudo cp "$F.bak" "$F"; exit 1
+  fi'
+
+# 2) Import the SGLang image to the shared squashfs. enroot import needs a node-local NON-overlay
+#    fs for its temp (overlay / can't mknod the overlayfs whiteouts) — the step auto-detects one
+#    (honors $ENROOT_SCRATCH if set). Output goes to a .partial.sqsh sibling, renamed on success.
+if [[ -f "$SQSH" ]]; then
+    trlog "image already imported: $SQSH"
+else
+    trlog "importing $DOCKER_IMAGE -> $SQSH (auto-detect ext4 temp; multi-GB, ~minutes) ..."
+    mkdir -p "$ENROOT_DIR"
+    $STEP -N1 -w "$PREFILL_NODE" bash -c '
+      set -e; S=""
+      for c in "$ENROOT_SCRATCH" /scratch /raid /mnt/local /mnt/resource /var/tmp /tmp; do
+        [ -n "$c" ] || continue
+        [ -d "$c" ] || mkdir -p "$c" 2>/dev/null || continue
+        t=$(stat -f -c %T "$c" 2>/dev/null) || t=""   # stat failure: skip this candidate, do not trip set -e
+        case "$t" in overlayfs|overlay|tmpfs|"") continue;; esac
+        mkdir -p "$c/enroot" 2>/dev/null || continue
+        S="$c/enroot"; break
+      done
+      [ -n "$S" ] || { echo "[$(hostname)] ERROR: no node-local non-overlay scratch (set ENROOT_SCRATCH)"; exit 1; }
+      export ENROOT_CACHE_PATH="$S/cache" ENROOT_TEMP_PATH="$S/tmp" TMPDIR="$S/tmp"
+      mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_TEMP_PATH"
+      echo "[$(hostname)] enroot temp on $S ($t)"
+      P="${SQSH%.sqsh}.partial.sqsh"; rm -f "$P"   # a truncated image must never sit at the final path
+      enroot import -o "$P" "$DOCKER_IMAGE" && mv -f "$P" "$SQSH"'
+fi
+
+# 3) Prestage weights to the shared FS (zero-download launches). A non-empty token file in the
+#    mounted HF cache is exported as HF_TOKEN; otherwise an HF_TOKEN already in the env is kept.
+if [[ -f "$MODEL_DIR/config.json" ]]; then
+    trlog "weights present: $MODEL_DIR"
+else
+    trlog "downloading $MODEL_HF_ID -> $MODEL_DIR ..."
+    mkdir -p "$MODEL_DIR"
+    $STEP -N1 -w "$DECODE_NODE" --container-image="$SQSH" \
+      --container-mounts="$HF_CACHE:/root/.cache/huggingface,$MODELS_ROOT:$MODELS_ROOT" \
+      bash -c 'tok=$(cat /root/.cache/huggingface/token 2>/dev/null); [ -z "$tok" ] || export HF_TOKEN="$tok"
+               hf download "$MODEL_HF_ID" --local-dir "$MODEL_DIR"'
+fi
+trlog "setup complete. (allocation $ALLOC_JOB held; next: bash 01_preflight.sh)"
diff --git a/together_runner/slurm-disagg/01_preflight.sh b/together_runner/slurm-disagg/01_preflight.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# Preflight: auto-detect + verify the RDMA/KV path BEFORE launching servers, so a
+# new cluster fails in seconds with a clear reason instead of after a multi-minute
+# server startup. Resolves IB_DEVICES (GPU<->NIC topology) and the
+# WITH_NVIDIA_PEERMEM decision (peermem vs dmabuf), checks IB port state on both
+# nodes, and verifies the /dev/infiniband bind-mount + dmabuf inside the container.
+# Writes the resolved values to $LOG_DIR/disagg_detected.env (sourced by 10_launch.sh).
+# Checks accumulate into `fail`; the env file is written only when every check passed.
+#   bash 01_preflight.sh            # detect + check, write detected.env
+#   PROBE_MOONCAKE=1 bash 01_preflight.sh   # + heavy mooncake register_memory probe
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_nodes   # nodes/ALLOC_JOB only — re-derive IB/peermem from clean config each run
+alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; }
+DETECTED="$LOG_DIR/disagg_detected.env"   # output file of this script
+SR="srun --jobid=$ALLOC_JOB --overlap --export=ALL -t 5"   # common srun prefix for every probe step; expanded unquoted
+fail=0   # set to 1 by any failed check below
+
+# --- 1. IB ports ACTIVE/LinkUp on BOTH nodes (host-side, fast): count HCAs whose port 1 is up ---
+trlog "checking IB port state on $PREFILL_NODE,$DECODE_NODE ..."
+$SR -N2 --ntasks-per-node=1 --nodelist="$PREFILL_NODE,$DECODE_NODE" bash -c '
+  total=0; up=0
+  for hca in /sys/class/infiniband/*; do
+    [ -e "$hca/ports/1/state" ] || continue; total=$((total+1))
+    st=$(cat "$hca/ports/1/state"); phys=$(cat "$hca/ports/1/phys_state")
+    if [[ "$st" == *ACTIVE* && "$phys" == *LinkUp* ]]; then up=$((up+1)); fi
+  done
+  echo "[$(hostname)] IB HCAs: $up/$total ACTIVE+LinkUp"
+  if [ "$up" -le 0 ]; then echo "[$(hostname)] ERROR: no ACTIVE IB port"; exit 1; fi
+' || { trerr "IB port check failed"; fail=1; }
+
+# --- 2. Resolve IB_DEVICES (topology) + container RDMA check on prefill node ---
+CHK_OUT="$LOG_DIR/.preflight_chk.out"; mkdir -p "$LOG_DIR"   # tee below needs the directory to exist
+case "${PROBE_MOONCAKE:-}" in ""|0|false|no) MOONCAKE_ARG="" ;; *) MOONCAKE_ARG="--mooncake" ;; esac   # 0/false/no = probe off
+trlog "detecting GPU<->NIC topology + checking RDMA inside container on $PREFILL_NODE ..."
+$SR -N1 --nodelist="$PREFILL_NODE" --gres=gpu:$GPUS_PER_NODE \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "
+    echo '### DETECT ###'
+    python3 /inferencex/together_runner/slurm-disagg/_detect_rdma.py
+    echo '### CHECK ###'
+    python3 /inferencex/together_runner/slurm-disagg/_check_container_rdma.py $MOONCAKE_ARG
+  " 2>&1 | tee "$CHK_OUT" || { trerr "container RDMA check failed (see above)"; fail=1; }
+
+# Pull machine-readable KEY=value lines out of the captured output (empty when a key is absent).
+DET_IB="$(grep -m1 '^IB_DEVICES=' "$CHK_OUT" | cut -d= -f2- || true)"
+DET_LL="$(grep -m1 '^IB_LINK_LAYER=' "$CHK_OUT" | cut -d= -f2- || true)"
+DMABUF="$(grep -m1 '^DMABUF_SUPPORTED=' "$CHK_OUT" | cut -d= -f2- || true)"
+
+# Honor an explicit override; else use the detected list.
+IB_FINAL="${IB_DEVICES:-$DET_IB}"
+[[ -n "$IB_FINAL" ]] || { trerr "could not resolve IB_DEVICES (set it explicitly in config.env)"; fail=1; }
+if [[ -n "${IB_DEVICES:-}" && -n "$DET_IB" && "$IB_DEVICES" != "$DET_IB" ]]; then trlog "NOTE: explicit IB_DEVICES ($IB_DEVICES) differs from detected ($DET_IB) — using explicit."; fi
+
+# --- 3. peermem vs dmabuf decision (host-side: module presence + driver version) ---
+PEERMEM_FINAL="${WITH_NVIDIA_PEERMEM:-}"
+if [[ -z "$PEERMEM_FINAL" ]]; then
+  trlog "deciding KV mem-registration path (nvidia_peermem vs dmabuf) ..."
+  # Probe prints "peermem <drv>" or "dmabuf <drv> <major>"; a failed step is reported, not a silent set -e exit.
+  DEC="$($SR -N1 --nodelist="$PREFILL_NODE" bash -c '
+    drv=$(cat /sys/module/nvidia/version 2>/dev/null); maj=${drv%%.*}
+    if modinfo nvidia_peermem >/dev/null 2>&1; then echo "peermem $drv"; else echo "dmabuf $drv $maj"; fi')" \
+    || { trerr "peermem/dmabuf probe step failed on $PREFILL_NODE"; fail=1; DEC="probe-failed"; }
+  read -r MODE DRV MAJ <<<"$DEC"
+  if [[ "$MODE" == "peermem" ]]; then
+    trlog "nvidia_peermem AVAILABLE (driver $DRV) — using default peermem path (WITH_NVIDIA_PEERMEM unset)."
+  elif [[ "$MODE" != "probe-failed" && "$MAJ" =~ ^[0-9]+$ && "$MAJ" -ge 535 ]]; then
+    PEERMEM_FINAL=0
+    trlog "nvidia_peermem ABSENT (driver $DRV ≥535) — forcing dmabuf (WITH_NVIDIA_PEERMEM=0)."
+    [[ "$DMABUF" == "1" ]] || { trerr "dmabuf chosen but libibverbs lacks ibv_reg_dmabuf_mr — KV transfer will fail"; fail=1; }
+  elif [[ "$MODE" != "probe-failed" ]]; then
+    trerr "nvidia_peermem ABSENT and driver $DRV <535 (no dmabuf) — KV transfer cannot register GPU mem"; fail=1
+  fi
+else
+  trlog "WITH_NVIDIA_PEERMEM explicitly set to '$PEERMEM_FINAL' — honoring it."
+fi
+
+# --- 4. write the resolved truth source (on failure, also drop any stale copy from an earlier run) ---
+if [[ "$fail" == "0" ]]; then
+  mkdir -p "$LOG_DIR"
+  cat > "$DETECTED" <<EOF
+# Auto-resolved by 01_preflight.sh — sourced by 10_launch.sh. Re-run preflight to refresh.
+IB_DEVICES=$IB_FINAL
+IB_LINK_LAYER=${DET_LL:-unknown}
+WITH_NVIDIA_PEERMEM=$PEERMEM_FINAL
+EOF
+  rm -f "$CHK_OUT"
+  trlog "PREFLIGHT OK → $DETECTED"
+  trlog "  IB_DEVICES=$IB_FINAL"
+  trlog "  WITH_NVIDIA_PEERMEM=${PEERMEM_FINAL:-<unset:peermem>}"
+  trlog "next: bash 10_launch.sh"
+else
+  rm -f "$DETECTED"; trerr "PREFLIGHT FAILED — fix the above before launching (detected.env NOT written)."; exit 1
+fi
diff --git a/together_runner/slurm-disagg/10_launch.sh b/together_runner/slurm-disagg/10_launch.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# Launch the 1P1D disaggregated endpoint as overlap steps INTO the 2-node allocation:
+# prefill@PREFILL_NODE + decode@DECODE_NODE + sgl-router@PREFILL_NODE. Waits for health
+# and writes a state file (endpoint) for benchmark/teardown.
+# Runs 01_preflight.sh itself when its output file is missing.
+#   bash 10_launch.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_resolved
+alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; }
+STATE="$LOG_DIR/disagg_state.env"   # written at the very end, once the router is healthy
+
+# Resolved RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM) comes from preflight; sourcing it here overrides any values set earlier.
+DETECTED="$LOG_DIR/disagg_detected.env"
+[[ -f "$DETECTED" ]] || { trlog "no $DETECTED — running 01_preflight.sh first ..."; bash "$HERE/01_preflight.sh"; }
+source "$DETECTED"
+[[ -n "${IB_DEVICES:-}" ]] || { trerr "IB_DEVICES unresolved after preflight"; exit 1; }
+
+MOUNTS="$(container_mounts)"
+CG="$(cuda_graph_arg)"   # extra server flag; the prefill launch log reports cuda_graph=on when this is empty
+# peermem path: empty => default (peermem); set => prefix WITH_NVIDIA_PEERMEM=<val> (dmabuf when 0). The prefix is prepended to the server command line.
+PEERMEM_PREFIX=""; [[ -n "${WITH_NVIDIA_PEERMEM:-}" ]] && PEERMEM_PREFIX="WITH_NVIDIA_PEERMEM=$WITH_NVIDIA_PEERMEM "
+STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"
+trlog "RDMA: IB_DEVICES=$IB_DEVICES  peermem=${WITH_NVIDIA_PEERMEM:-<unset>}  alloc=$ALLOC_JOB"
+
+# --- prefill (with KV bootstrap server) ---
+# launch_sglang_server ROLE NODE [EXTRA_FLAGS]: truncate $LOG_DIR/ROLE.log, then start
+# sglang.launch_server in disaggregation mode ROLE as a detached overlap step on NODE.
+launch_sglang_server() {
+  local role="$1" node="$2" extra="${3:-}"
+  : > "$LOG_DIR/$role.log"
+  nohup $STEP -N1 -w "$node" --gres=gpu:$GPUS_PER_NODE \
+    --container-image="$SQSH" --container-mounts="$MOUNTS" \
+    bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \
+      --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \
+      --host 0.0.0.0 --port $PORT --trust-remote-code \
+      --disaggregation-mode $role $extra \
+      --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/$role.log 2>&1" >/dev/null 2>&1 &
+}
+
+trlog "launching prefill on $PREFILL_NODE (TP$TP, cuda_graph=$([[ -z $CG ]] && echo on || echo off)) ..."
+launch_sglang_server prefill "$PREFILL_NODE" "--disaggregation-bootstrap-port $BOOTSTRAP_PORT"
+
+# --- decode ---
+trlog "launching decode on $DECODE_NODE (TP$TP) ..."
+launch_sglang_server decode "$DECODE_NODE"
+
+# Block until each server answers /health (up to 1800 s apiece), prefill first, then decode.
+trlog "waiting for both servers to be healthy (cuda-graph capture can take several minutes) ..."
+if ! wait_health "http://$PREFILL_IP:$PORT/health" 1800 "$ALLOC_JOB"; then trerr "prefill never healthy; see $LOG_DIR/prefill.log"; exit 1; fi
+trlog "prefill healthy."
+if ! wait_health "http://$DECODE_IP:$PORT/health" 1800 "$ALLOC_JOB"; then trerr "decode never healthy; see $LOG_DIR/decode.log"; exit 1; fi
+trlog "decode healthy."
+
+# --- router: another overlap step on the prefill node. MUST mount the model dir so the
+#     tokenizer loads locally (else 404s to HF). ---
+ROUTER_URL="http://$PREFILL_IP:$ROUTER_PORT"
+trlog "launching sgl-router on $PREFILL_NODE:$ROUTER_PORT ..."
+: > "$LOG_DIR/router.log"
+nohup $STEP -N1 -w "$PREFILL_NODE" \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "python3 -m sglang_router.launch_router --pd-disaggregation \
+    --prefill http://$PREFILL_IP:$PORT $BOOTSTRAP_PORT --decode http://$DECODE_IP:$PORT \
+    --host 0.0.0.0 --port $ROUTER_PORT --policy random > $LOG_DIR/router.log 2>&1" >/dev/null 2>&1 &
+
+if ! wait_health "$ROUTER_URL/health" 120 "$ALLOC_JOB"; then trerr "router never healthy; see $LOG_DIR/router.log"; exit 1; fi
+
+# State file (allocation id + router endpoint), written only after the router is healthy.
+printf 'ALLOC_JOB=%s\nENDPOINT=%s\n' "$ALLOC_JOB" "$ROUTER_URL" > "$STATE"
+trlog "ENDPOINT READY: $ROUTER_URL  (state: $STATE)"
+trlog "next: bash 20_benchmark.sh   |   teardown: bash teardown.sh"
diff --git a/together_runner/slurm-disagg/20_benchmark.sh b/together_runner/slurm-disagg/20_benchmark.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Concurrency sweep against the live disagg endpoint using the InferenceX unified
+# client (utils/bench_serving/benchmark_serving.py). Reuses one warm router.
+# Saves a result JSON per concurrency under RESULTS_DIR and prints a summary table.
+# Requires the state file written by 10_launch.sh (ALLOC_JOB + ENDPOINT).
+#   bash 20_benchmark.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_resolved
+STATE="$LOG_DIR/disagg_state.env"
+[[ -f "$STATE" ]] || { trerr "no state file ($STATE) — run 10_launch.sh first"; exit 1; }
+source "$STATE"
+alloc_alive "${ALLOC_JOB:-}" || { trerr "allocation ${ALLOC_JOB:-<unset>} gone — relaunch"; exit 1; }   # guarded: set -u must not mask the message
+mkdir -p "$RESULTS_DIR"
+
+curl -sf -m4 "$ENDPOINT/health" >/dev/null || { trerr "endpoint $ENDPOINT not healthy"; exit 1; }
+SEQTAG="$(( ISL/1024 ))k$(( OSL/1024 ))k"   # integer division: lengths below 1024 render as 0k
+trlog "sweep CONC_LIST='$CONC_LIST' at ISL=$ISL OSL=$OSL via $ENDPOINT"
+
+# Run the bench client inside the image (mount repo at /inferencex + model for tokenizer), as an overlap step on the prefill
+# allocation. One srun does the whole sweep: per level num_prompts = C * PROMPTS_PER_CONC (min 160); only the grep-selected metric lines reach sweep.log.
+nohup srun --jobid="$ALLOC_JOB" --overlap --nodelist="$PREFILL_NODE" \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "
+    pip install -q datasets pandas >/dev/null 2>&1 || true
+    for C in $CONC_LIST; do
+      NP=\$(( C * $PROMPTS_PER_CONC )); [ \$NP -lt 160 ] && NP=160
+      echo \"############ conc=\$C num_prompts=\$NP ############\"
+      python3 /inferencex/utils/bench_serving/benchmark_serving.py \
+        --backend sglang --model $SERVED_NAME --tokenizer $MODEL_DIR \
+        --base-url $ENDPOINT --endpoint /v1/completions \
+        --dataset-name random --random-input-len $ISL --random-output-len $OSL \
+        --max-concurrency \$C --num-prompts \$NP --percentile-metrics ttft,tpot,itl,e2el \
+        --save-result --result-dir $RESULTS_DIR \
+        --result-filename ${SERVED_NAME}_${SEQTAG}_conc\${C}.bench.json 2>&1 \
+        | grep -E 'Successful requests|Total Token throughput|Output token throughput|Median TTFT|P99 TTFT|Median TPOT|Median E2EL'
+      echo \"=== conc=\$C done ===\"
+    done
+    echo ALL_SWEEP_DONE" 2>&1 | tee "$LOG_DIR/sweep.log"
+
+# Summary table from the result JSONs; a missing or unreadable file gets a placeholder row instead of aborting.
+trlog "==== SWEEP SUMMARY ($SEQTAG) ===="
+python3 - "$RESULTS_DIR" "$SERVED_NAME" "$SEQTAG" $CONC_LIST <<'PY'
+import json, sys, os
+rdir, name, seqtag, *concs = sys.argv[1:]
+hdr = f"{'conc':>5} {'ok':>11} {'total tok/s':>12} {'out tok/s':>10} {'mTPOT ms':>9} {'mTTFT ms':>9} {'p99TTFT ms':>11}"
+print(hdr); print('-'*len(hdr))
+for C in concs:
+    f = os.path.join(rdir, f"{name}_{seqtag}_conc{C}.bench.json")
+    try:
+        with open(f) as fh: d = json.load(fh)   # handle is closed; truncated JSON raises ValueError
+    except (OSError, ValueError) as e: print(f"{C:>5}  ({'missing' if isinstance(e, FileNotFoundError) else 'unreadable'})"); continue
+    ok = f"{d.get('completed')}/{d.get('num_prompts','?')}"
+    print(f"{C:>5} {ok:>11} {(d.get('total_token_throughput') or 0):>12.0f} {(d.get('output_throughput') or 0):>10.0f} "
+          f"{(d.get('median_tpot_ms') or 0):>9.1f} {(d.get('median_ttft_ms') or 0):>9.0f} {(d.get('p99_ttft_ms') or 0):>11.0f}")
+PY
+trlog "raw results: $RESULTS_DIR  | sweep log: $LOG_DIR/sweep.log"