diff --git a/together_runner/slurm-disagg/00_setup.sh b/together_runner/slurm-disagg/00_setup.sh
new file mode 100755
index 000000000..c23bbbc2a
--- /dev/null
+++ b/together_runner/slurm-disagg/00_setup.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# One-time (per pod boot) setup: grab the 2-node allocation, apply ephemeral node fixes,
+# import the image, prestage weights. Idempotent — safe to re-run (reuses a live
+# allocation). Needs sudo for the enroot nvidia-hook patch.
+#
+#   bash 00_setup.sh
+set -euo pipefail                   # abort on errors, unset variables, failed pipeline stages
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"   # absolute directory of this script
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"   # settings, then the shared helpers (presumably trlog, load_resolved, ...)
+load_resolved                       # reuse a persisted live allocation if present
+ensure_allocation || exit 1         # ALLOC_JOB + PREFILL_NODE/DECODE_NODE now set
+STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"   # step prefix; used UNQUOTED below so it word-splits into srun + flags
+
+# 1) Patch the enroot nvidia hook on BOTH nodes (skip the persistenced/fabricmanager
+#    sockets that can't be bind-mounted inside the nested pod). Ephemeral; redo per boot.
+#    NOTE: this sed-patch of the *system* hook is unavoidable on this stack — a clean
+#    user-level override is not possible here (verified on enroot 4.0.1): runtime.sh runs
+#    the system AND user hooks.d with no basename dedup, so a user copy can't replace the
+#    system 98-nvidia.sh; and pyxis ignores a per-job ENROOT_SYSCONF_PATH redirect.
+trlog "patching enroot nvidia hook on $PREFILL_NODE,$DECODE_NODE ..."
+$STEP -N2 --ntasks-per-node=1 -w "$PREFILL_NODE,$DECODE_NODE" bash -c '
+  set -e; H=$(hostname); F=/etc/enroot/hooks.d/98-nvidia.sh
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook already patched"; exit 0; fi
+  sudo -n true 2>/dev/null || { echo "[$H] ERROR: sudo unavailable — cannot patch $F"; exit 1; }
+  sudo cp "$F" "$F.bak"
+  sudo sed -i "s|cli_args=(\"--no-cgroups\" |cli_args=(\"--no-cgroups\" \"--no-persistenced\" \"--no-fabricmanager\" |" "$F"
+  if grep -q "no-persistenced" "$F"; then echo "[$H] hook patched"; else
+    echo "[$H] ERROR: patch did not take (hook format changed?) — restoring backup"; sudo cp "$F.bak" "$F"; exit 1
+  fi'
+
+# 2) Import the SGLang image to the shared squashfs. enroot import needs a node-local NON-overlay
+#    fs for its temp (overlay / can't mknod the overlayfs whiteouts); auto-detected, honors
+#    $ENROOT_SCRATCH. Written to $SQSH.part and renamed on success so a partial file is never reused.
+if [[ -f "$SQSH" ]]; then
+    trlog "image already imported: $SQSH"
+else
+    trlog "importing $DOCKER_IMAGE -> $SQSH (auto-detect ext4 temp; multi-GB, ~minutes) ..."
+    mkdir -p "$ENROOT_DIR"
+    $STEP -N1 -w "$PREFILL_NODE" bash -c '
+      set -e
+      S=""
+      for c in "$ENROOT_SCRATCH" /scratch /raid /mnt/local /mnt/resource /var/tmp /tmp; do
+        [ -n "$c" ] || continue
+        [ -d "$c" ] || mkdir -p "$c" 2>/dev/null || continue
+        t=$(stat -f -c %T "$c" 2>/dev/null) || t=""
+        case "$t" in overlayfs|overlay|tmpfs|"") continue;; esac
+        mkdir -p "$c/enroot" 2>/dev/null || continue
+        S="$c/enroot"; break
+      done
+      [ -n "$S" ] || { echo "[$(hostname)] ERROR: no node-local non-overlay scratch (set ENROOT_SCRATCH)"; exit 1; }
+      export ENROOT_CACHE_PATH="$S/cache" ENROOT_TEMP_PATH="$S/tmp" TMPDIR="$S/tmp"
+      mkdir -p "$ENROOT_CACHE_PATH" "$ENROOT_TEMP_PATH"
+      echo "[$(hostname)] enroot temp on $S ($t)"
+      rm -f "$SQSH.part"; enroot import -o "$SQSH.part" "$DOCKER_IMAGE"; mv -f "$SQSH.part" "$SQSH"'
+fi
+
+# 3) Prestage weights to the shared FS (zero-download launches). A caller-exported HF_TOKEN wins over the cached token file.
+if [[ -f "$MODEL_DIR/config.json" ]]; then
+    trlog "weights present: $MODEL_DIR"
+else
+    trlog "downloading $MODEL_HF_ID -> $MODEL_DIR ..."
+    mkdir -p "$MODEL_DIR"
+    $STEP -N1 -w "$DECODE_NODE" \
+      --container-image="$SQSH" \
+      --container-mounts="$HF_CACHE:/root/.cache/huggingface,$MODELS_ROOT:$MODELS_ROOT" \
+      bash -c '[ -n "${HF_TOKEN:-}" ] || export HF_TOKEN=$(cat /root/.cache/huggingface/token 2>/dev/null)
+               hf download "$MODEL_HF_ID" --local-dir "$MODEL_DIR"'
+fi
+trlog "setup complete. (allocation $ALLOC_JOB held; next: bash 01_preflight.sh)"
diff --git a/together_runner/slurm-disagg/01_preflight.sh b/together_runner/slurm-disagg/01_preflight.sh
new file mode 100755
index 000000000..5ef2892ca
--- /dev/null
+++ b/together_runner/slurm-disagg/01_preflight.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# Preflight: auto-detect + verify the RDMA/KV path BEFORE launching servers, so a
+# new cluster fails in seconds with a clear reason instead of after a multi-minute
+# server startup. Resolves IB_DEVICES (GPU<->NIC topology) and the
+# WITH_NVIDIA_PEERMEM decision (peermem vs dmabuf), checks IB port state on both
+# nodes, and verifies the /dev/infiniband bind-mount + dmabuf inside the container.
+# Writes the resolved values to $LOG_DIR/disagg_detected.env (sourced by 10_launch.sh).
+#
+#   bash 01_preflight.sh            # detect + check, write detected.env
+#   PROBE_MOONCAKE=1 bash 01_preflight.sh   # + heavy mooncake register_memory probe
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_nodes   # nodes/ALLOC_JOB only — re-derive IB/peermem from clean config each run
+alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; }
+DETECTED="$LOG_DIR/disagg_detected.env"
+SR="srun --jobid=$ALLOC_JOB --overlap --export=ALL -t 5"   # probe-step prefix (used unquoted); -t 5 = 5-minute limit per step
+fail=0   # set to 1 by any failed check below; gates the detected.env write at the end
+
+# --- 1. IB port 1 ACTIVE/LinkUp on BOTH nodes (host-side, fast); one healthy HCA per node is enough ---
+trlog "checking IB port state on $PREFILL_NODE,$DECODE_NODE ..."
+$SR -N2 --ntasks-per-node=1 --nodelist="$PREFILL_NODE,$DECODE_NODE" bash -c '
+  n=0; act=0
+  for d in /sys/class/infiniband/*; do
+    [ -e "$d/ports/1/state" ] || continue; n=$((n+1))
+    s=$(cat "$d/ports/1/state"); p=$(cat "$d/ports/1/phys_state")
+    [[ "$s" == *ACTIVE* && "$p" == *LinkUp* ]] && act=$((act+1))
+  done
+  echo "[$(hostname)] IB HCAs: $act/$n ACTIVE+LinkUp"
+  [ "$act" -gt 0 ] || { echo "[$(hostname)] ERROR: no ACTIVE IB port"; exit 1; }
+' || { trerr "IB port check failed"; fail=1; }   # recorded, not fatal yet: the remaining checks still run
+
+# --- 2. Resolve IB_DEVICES (topology) + container RDMA check on prefill node ---
+CHK_OUT="$LOG_DIR/.preflight_chk.out"   # captured step output; removed only after a successful preflight
+trlog "detecting GPU<->NIC topology + checking RDMA inside container on $PREFILL_NODE ..."
+$SR -N1 --nodelist="$PREFILL_NODE" --gres=gpu:$GPUS_PER_NODE \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "
+    echo '### DETECT ###'
+    python3 /inferencex/together_runner/slurm-disagg/_detect_rdma.py
+    echo '### CHECK ###'
+    python3 /inferencex/together_runner/slurm-disagg/_check_container_rdma.py ${PROBE_MOONCAKE:+--mooncake}
+  " 2>&1 | tee "$CHK_OUT" || { trerr "container RDMA check failed (see above)"; fail=1; }   # pipefail (set above) surfaces a failed srun through tee
+
+# Pull the machine-readable KEY=value lines out of the captured output (first match each; empty if absent).
+DET_IB="$(grep -m1 '^IB_DEVICES=' "$CHK_OUT" | cut -d= -f2- || true)"
+DET_LL="$(grep -m1 '^IB_LINK_LAYER=' "$CHK_OUT" | cut -d= -f2- || true)"
+DMABUF="$(grep -m1 '^DMABUF_SUPPORTED=' "$CHK_OUT" | cut -d= -f2- || true)"
+
+# Honor an explicit override (IB_DEVICES already set in the environment/config); else use the detected list.
+IB_FINAL="${IB_DEVICES:-$DET_IB}"
+[[ -n "$IB_FINAL" ]] || { trerr "could not resolve IB_DEVICES (set it explicitly in config.env)"; fail=1; }
+[[ -n "${IB_DEVICES:-}" && -n "$DET_IB" && "$IB_DEVICES" != "$DET_IB" ]] && \
+  trlog "NOTE: explicit IB_DEVICES ($IB_DEVICES) differs from detected ($DET_IB) — using explicit."
+
+# --- 3. peermem vs dmabuf decision (host-side: is nvidia_peermem LOADED (sysfs, not modinfo) + driver version) ---
+PEERMEM_FINAL="${WITH_NVIDIA_PEERMEM:-}"
+if [[ -z "$PEERMEM_FINAL" ]]; then
+  trlog "deciding KV mem-registration path (nvidia_peermem vs dmabuf) ..."
+  DEC="$($SR -N1 --nodelist="$PREFILL_NODE" bash -c '
+    drv=$(cat /sys/module/nvidia/version 2>/dev/null); maj=${drv%%.*}
+    if [ -d /sys/module/nvidia_peermem ]; then echo "peermem $drv"; else echo "dmabuf $drv $maj"; fi')"
+  read -r MODE DRV MAJ <<<"$DEC"
+  if [[ "$MODE" == "peermem" ]]; then
+    trlog "nvidia_peermem LOADED (driver $DRV) — using default peermem path (WITH_NVIDIA_PEERMEM unset)."
+  else
+    if [[ "${MAJ:-0}" -ge 535 ]]; then
+      PEERMEM_FINAL=0
+      trlog "nvidia_peermem NOT LOADED (driver $DRV ≥535) — forcing dmabuf (WITH_NVIDIA_PEERMEM=0)."
+      [[ "$DMABUF" == "1" ]] || { trerr "dmabuf chosen but libibverbs lacks ibv_reg_dmabuf_mr — KV transfer will fail"; fail=1; }
+    else
+      trerr "nvidia_peermem NOT LOADED and driver $DRV <535 (no dmabuf) — KV transfer cannot register GPU mem"; fail=1
+    fi
+  fi
+else
+  trlog "WITH_NVIDIA_PEERMEM explicitly set to '$PEERMEM_FINAL' — honoring it."
+fi
+
+# --- 4. write the resolved truth source (only when every check above passed; otherwise exit 1) ---
+if [[ "$fail" == "0" ]]; then
+  mkdir -p "$LOG_DIR"
+  cat > "$DETECTED" <<EOF
+# Auto-resolved by 01_preflight.sh — sourced by 10_launch.sh. Re-run preflight to refresh.
+IB_DEVICES=$IB_FINAL
+IB_LINK_LAYER=${DET_LL:-unknown}
+WITH_NVIDIA_PEERMEM=$PEERMEM_FINAL
+EOF
+  rm -f "$CHK_OUT"   # scratch capture is dropped once the values are persisted
+  trlog "PREFLIGHT OK → $DETECTED"
+  trlog "  IB_DEVICES=$IB_FINAL"
+  trlog "  WITH_NVIDIA_PEERMEM=${PEERMEM_FINAL:-<unset:peermem>}"
+  trlog "next: bash 10_launch.sh"
+else
+  trerr "PREFLIGHT FAILED — fix the above before launching (detected.env NOT written)."; exit 1
+fi
diff --git a/together_runner/slurm-disagg/10_launch.sh b/together_runner/slurm-disagg/10_launch.sh
new file mode 100755
index 000000000..9d8d59c27
--- /dev/null
+++ b/together_runner/slurm-disagg/10_launch.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+# Launch the 1P1D disaggregated endpoint as overlap steps INTO the 2-node allocation:
+# prefill@PREFILL_NODE + decode@DECODE_NODE + sgl-router@PREFILL_NODE. Waits for health
+# and writes a state file (endpoint) for benchmark/teardown.
+#
+#   bash 10_launch.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_resolved
+alloc_alive "${ALLOC_JOB:-}" || { trerr "no live allocation — run 00_setup.sh first"; exit 1; }
+STATE="$LOG_DIR/disagg_state.env"   # written at the very end, only after the router reports healthy
+
+# Resolved RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM) comes from preflight.
+DETECTED="$LOG_DIR/disagg_detected.env"
+[[ -f "$DETECTED" ]] || { trlog "no $DETECTED — running 01_preflight.sh first ..."; bash "$HERE/01_preflight.sh"; }
+source "$DETECTED"   # values in this file replace any IB_DEVICES / WITH_NVIDIA_PEERMEM set earlier
+[[ -n "${IB_DEVICES:-}" ]] || { trerr "IB_DEVICES unresolved after preflight"; exit 1; }
+
+MOUNTS="$(container_mounts)"   # helper defined outside this script (presumably disagg_lib.sh)
+CG="$(cuda_graph_arg)"   # empty output is reported as cuda_graph=on in the log line below
+# peermem path: empty => default (peermem); set => prefix WITH_NVIDIA_PEERMEM=<val> (dmabuf when 0).
+PEERMEM_PREFIX=""; [[ -n "${WITH_NVIDIA_PEERMEM:-}" ]] && PEERMEM_PREFIX="WITH_NVIDIA_PEERMEM=$WITH_NVIDIA_PEERMEM "
+STEP="srun --jobid=$ALLOC_JOB --overlap --export=ALL"   # used unquoted below so it word-splits into srun + flags
+trlog "RDMA: IB_DEVICES=$IB_DEVICES  peermem=${WITH_NVIDIA_PEERMEM:-<unset>}  alloc=$ALLOC_JOB"
+
+# --- prefill (with KV bootstrap server) ---
+trlog "launching prefill on $PREFILL_NODE (TP$TP, cuda_graph=$([[ -z $CG ]] && echo on || echo off)) ..."
+: > "$LOG_DIR/prefill.log"   # truncate any log left by a previous launch
+nohup $STEP -N1 -w "$PREFILL_NODE" --gres=gpu:$GPUS_PER_NODE \
+  --container-image="$SQSH" --container-mounts="$MOUNTS" \
+  bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \
+    --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \
+    --host 0.0.0.0 --port $PORT --trust-remote-code \
+    --disaggregation-mode prefill --disaggregation-bootstrap-port $BOOTSTRAP_PORT \
+    --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/prefill.log 2>&1" >/dev/null 2>&1 &   # backgrounded; server output goes to the log from inside the step, srun's own output is discarded
+
+# --- decode ---
+trlog "launching decode on $DECODE_NODE (TP$TP) ..."
+: > "$LOG_DIR/decode.log"   # truncate any log left by a previous launch
+nohup $STEP -N1 -w "$DECODE_NODE" --gres=gpu:$GPUS_PER_NODE \
+  --container-image="$SQSH" --container-mounts="$MOUNTS" \
+  bash -c "${PEERMEM_PREFIX}python3 -m sglang.launch_server \
+    --model-path $MODEL_DIR --served-model-name $SERVED_NAME --tp $TP \
+    --host 0.0.0.0 --port $PORT --trust-remote-code \
+    --disaggregation-mode decode \
+    --disaggregation-ib-device $IB_DEVICES $CG > $LOG_DIR/decode.log 2>&1" >/dev/null 2>&1 &   # backgrounded like prefill; readiness is polled below
+
+trlog "waiting for both servers to be healthy (cuda-graph capture can take several minutes) ..."
+wait_health "http://$PREFILL_IP:$PORT/health" 1800 "$ALLOC_JOB" || { trerr "prefill never healthy; see $LOG_DIR/prefill.log"; exit 1; }
+trlog "prefill healthy."
+wait_health "http://$DECODE_IP:$PORT/health" 1800 "$ALLOC_JOB" || { trerr "decode never healthy; see $LOG_DIR/decode.log"; exit 1; }
+trlog "decode healthy."
+
+# --- router: another overlap step on the prefill node. MUST mount the model dir so the
+#     tokenizer loads locally (else 404s to HF). ---
+trlog "launching sgl-router on $PREFILL_NODE:$ROUTER_PORT ..."
+: > "$LOG_DIR/router.log"   # truncate any log left by a previous launch
+nohup $STEP -N1 -w "$PREFILL_NODE" \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "python3 -m sglang_router.launch_router --pd-disaggregation \
+    --prefill http://$PREFILL_IP:$PORT $BOOTSTRAP_PORT --decode http://$DECODE_IP:$PORT \
+    --host 0.0.0.0 --port $ROUTER_PORT --policy random > $LOG_DIR/router.log 2>&1" >/dev/null 2>&1 &   # no --gres: this step requests no GPUs
+
+wait_health "http://$PREFILL_IP:$ROUTER_PORT/health" 120 "$ALLOC_JOB" || { trerr "router never healthy; see $LOG_DIR/router.log"; exit 1; }
+
+cat > "$STATE" <<EOF
+ALLOC_JOB=$ALLOC_JOB
+ENDPOINT=http://$PREFILL_IP:$ROUTER_PORT
+EOF
+trlog "ENDPOINT READY: http://$PREFILL_IP:$ROUTER_PORT  (state: $STATE)"
+trlog "next: bash 20_benchmark.sh   |   teardown: bash teardown.sh"
diff --git a/together_runner/slurm-disagg/20_benchmark.sh b/together_runner/slurm-disagg/20_benchmark.sh
new file mode 100755
index 000000000..bbf6589e1
--- /dev/null
+++ b/together_runner/slurm-disagg/20_benchmark.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Concurrency sweep against the live disagg endpoint using the InferenceX unified
+# client (utils/bench_serving/benchmark_serving.py). Reuses one warm router.
+# Saves a result JSON per concurrency under RESULTS_DIR and prints a summary table.
+#
+#   bash 20_benchmark.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"; source "$HERE/disagg_lib.sh"
+load_resolved
+STATE="$LOG_DIR/disagg_state.env"
+[[ -f "$STATE" ]] || { trerr "no state file ($STATE) — run 10_launch.sh first"; exit 1; }
+source "$STATE"
+alloc_alive "${ALLOC_JOB:-}" || { trerr "allocation ${ALLOC_JOB:-<unset>} gone — relaunch"; exit 1; }
+mkdir -p "$RESULTS_DIR"
+
+curl -sf -m4 "$ENDPOINT/health" >/dev/null || { trerr "endpoint $ENDPOINT not healthy"; exit 1; }
+SEQTAG="$(( ISL/1024 ))k$(( OSL/1024 ))k"
+trlog "sweep CONC_LIST='$CONC_LIST' at ISL=$ISL OSL=$OSL via $ENDPOINT"
+# Set aside results of an earlier sweep (as *.prev) so a failed point shows "(missing)" below, not stale numbers.
+for c in $CONC_LIST; do prev_f="$RESULTS_DIR/${SERVED_NAME}_${SEQTAG}_conc${c}.bench.json"; if [[ -f "$prev_f" ]]; then mv -f -- "$prev_f" "$prev_f.prev"; fi; done
+# Bench client runs inside the image (repo at /inferencex + model for tokenizer) as ONE overlap srun step for the whole sweep.
+nohup srun --jobid="$ALLOC_JOB" --overlap --nodelist="$PREFILL_NODE" \
+  --container-image="$SQSH" --container-mounts="$(container_mounts "$REPO_ROOT:/inferencex:ro")" \
+  bash -c "
+    set -o pipefail; pip install -q datasets pandas >/dev/null 2>&1 || true
+    for C in $CONC_LIST; do
+      NP=\$(( C * $PROMPTS_PER_CONC )); [ \$NP -lt 160 ] && NP=160
+      echo \"############ conc=\$C num_prompts=\$NP ############\"
+      python3 /inferencex/utils/bench_serving/benchmark_serving.py \
+        --backend sglang --model $SERVED_NAME --tokenizer $MODEL_DIR \
+        --base-url $ENDPOINT --endpoint /v1/completions \
+        --dataset-name random --random-input-len $ISL --random-output-len $OSL \
+        --max-concurrency \$C --num-prompts \$NP --percentile-metrics ttft,tpot,itl,e2el \
+        --save-result --result-dir $RESULTS_DIR \
+        --result-filename ${SERVED_NAME}_${SEQTAG}_conc\${C}.bench.json 2>&1 \
+        | grep -E 'Successful requests|Total Token throughput|Output token throughput|Median TTFT|P99 TTFT|Median TPOT|Median E2EL' || echo \"ERROR: conc=\$C benchmark failed or printed no metrics\"
+      echo \"=== conc=\$C done ===\"
+    done
+    echo ALL_SWEEP_DONE" 2>&1 | tee "$LOG_DIR/sweep.log"
+
+# Summary table from the result JSONs.
+trlog "==== SWEEP SUMMARY ($SEQTAG) ===="
+python3 - "$RESULTS_DIR" "$SERVED_NAME" "$SEQTAG" $CONC_LIST <<'PY'
+import json, sys, os
+rdir, name, seqtag = sys.argv[1], sys.argv[2], sys.argv[3]
+concs = sys.argv[4:]
+hdr = f"{'conc':>5} {'ok':>11} {'total tok/s':>12} {'out tok/s':>10} {'mTPOT ms':>9} {'mTTFT ms':>9} {'p99TTFT ms':>11}"
+print(hdr); print('-'*len(hdr))
+for C in concs:
+    f = os.path.join(rdir, f"{name}_{seqtag}_conc{C}.bench.json")
+    if not os.path.exists(f): print(f"{C:>5}  (missing)"); continue
+    with open(f) as fh: d = json.load(fh)
+    ok = f"{d.get('completed')}/{d.get('num_prompts','?')}"
+    print(f"{C:>5} {ok:>11} {d.get('total_token_throughput',0):>12.0f} {d.get('output_throughput',0):>10.0f} "
+          f"{d.get('median_tpot_ms',0):>9.1f} {d.get('median_ttft_ms',0):>9.0f} {d.get('p99_ttft_ms',0):>11.0f}")
+PY
+trlog "raw results: $RESULTS_DIR  | sweep log: $LOG_DIR/sweep.log"
diff --git a/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md b/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md
new file mode 100644
index 000000000..3626c9aeb
--- /dev/null
+++ b/together_runner/slurm-disagg/BENCHMARK-RECORD-qwen3-32b-disagg.md
@@ -0,0 +1,77 @@
+# Benchmark record — Qwen3-32B prefill/decode disaggregation (2-node, SGLang)
+
+**Date:** 2026-06-29 · **Cluster:** slinky (Slurm-on-k8s) · **Operator:** Johnsonms
+
+## Purpose
+ClusterMAX inference-disagg phase-0 readiness proof: confirm this tenant slice can
+deploy a 2-node prefill/decode-disaggregated SGLang endpoint, transfer KV cross-node
+over RDMA, and serve at a representative throughput. Bring-up model: **Qwen/Qwen3-32B**
+(dense), **1P1D** (1 prefill + 1 decode + sgl-router).
+
+## Config
+- **Topology:** prefill@slinky-0 (TP8), decode@slinky-1 (TP8), sgl-router@slinky-0:8002, `--policy random`.
+- **Hardware:** 2× node, each 8× B200 + 14× mlx5 IB. 16 GPUs total in the serving path.
+- **Image:** `lmsysorg/sglang:dev-cu13` (sglang `0.0.0.dev1+g909123ddb`, sglang-router 0.3.2,
+  mooncake `0.3.11.post1`), shared squashfs `/data/home/johnson/enroot/sglang-dev-cu13.sqsh`.
+- **KV transfer:** mooncake over RDMA, **dmabuf path forced via `WITH_NVIDIA_PEERMEM=0`**
+  (nvidia_peermem absent; driver 580). IB devices (GPU-adjacent, GPU-order):
+  `mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_4,mlx5_5,mlx5_6,mlx5_7`. `/dev/infiniband` bind-mounted.
+- **Serving args:** `--tp 8 --trust-remote-code`, **CUDA graph ON**, prefill
+  `--disaggregation-mode prefill --disaggregation-bootstrap-port 9000`, decode `--disaggregation-mode decode`.
+- **Bench client:** InferenceX `utils/bench_serving/benchmark_serving.py`, `--backend sglang`,
+  `--endpoint /v1/completions`, `--dataset-name random`, ISL=1024 OSL=1024, num_prompts=max(conc×8, 160).
+
+## Results (1k/1k, CUDA graph)
+
+| conc | requests | total tok/s | output tok/s | req/s | med TPOT (ms) | med TTFT (ms) | p99 TTFT (ms) | med E2E (ms) |
+|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+| 16  | 160/160   | 2,087 | 957   | 1.10 | 13.9 | 1,591  | 4,564  | 15,560 |
+| 64  | 512/512   | 4,591 | 2,110 | 2.37 | 23.9 | 3,616  | 18,651 | 28,010 |
+| 128 | 1024/1024 | 4,512 | 2,080 | 2.37 | 34.1 | 6,629  | 39,334 | 41,623 |
+| 256 | 2048/2048 | 4,898 | 2,243 | 2.59 | 43.1 | 26,913 | 80,696 | 66,562 |
+
+**0 failed requests across the entire sweep.** Raw per-point JSON: `/data/home/johnson/enroot/sweep/qwen3-32b_1k1k_conc{16,64,128,256}.bench.json`.
+
+### Post-refactor re-validation (2026-06-30, full sweep, allocation model)
+Re-ran the full sweep after the portability refactor: single 2-node `salloc --no-shell`
+allocation, prefill/decode/router/bench as overlap steps, `IB_DEVICES` + dmabuf decision
+auto-detected (IB list matched the hand-derived `mlx5_9,10,11,12,4,5,6,7` exactly).
+**0 failed requests across all points.**
+
+| conc | requests | total tok/s | output tok/s | med TPOT (ms) | med TTFT (ms) | p99 TTFT (ms) | vs 06-29 |
+|---:|---:|---:|---:|---:|---:|---:|---:|
+| 16  | 160/160   | 2,156 | 993   | 13.7 | 1,405  | 5,954  | +3% |
+| 64  | 512/512   | 4,614 | 2,127 | 23.0 | 3,933  | 20,987 | +0.5% |
+| 128 | 1024/1024 | 3,298 | 1,526 | 33.1 | 8,844  | 36,889 | **−27% (see note)** |
+| 256 | 2048/2048 | 4,892 | 2,229 | 42.2 | 27,071 | 79,414 | −0.1% |
+
+**conc=128 anomaly:** reproducibly ~3.3–3.4k (two runs: 3,437 then 3,298), i.e. *below*
+conc=64 — physically inconsistent with a healthy curve, and below the 06-29 baseline of
+4,512. NOT noise (reproduces) and NOT a code regression (serving args/nodes/IB are byte-for-byte
+unchanged; conc 16/64/256 match baseline). Read as a **1P1D dynamics artifact** at this
+concurrency: the single prefill instance interleaves badly with decode (prefill bursts starve
+decode, TTFT jumps to ~8.8s), whereas conc=64 stays stable and conc=256 is fully decode-saturated.
+A candidate to revisit when scaling prefill (NP1D) or tuning chunked-prefill scheduling.
+
+## Findings
+1. **Functional:** cross-node KV transfer works under sustained load — the whole path
+   (deploy → RDMA-in-pod → bootstrap → KV transfer → router → serve) is solid, 0 errors.
+2. **Peak throughput ≈ 4,900 tok/s total (~306 tok/s/GPU over 16 GPUs).**
+3. **Saturates at conc ≥ 64** (~4,500–4,900 tok/s). Adding concurrency past 64 yields ~no
+   extra throughput but sharply worse TTFT (p99 4.6 s → 80.7 s @ conc 256).
+4. **Bottleneck = the single prefill instance** (1P1D): decode TPOT stays healthy (14–43 ms)
+   while TTFT explodes as prefill queues. To raise throughput / cut TTFT tail → add prefill
+   instances (e.g. 2P1D / NP1D) and/or enable chunked-prefill tuning.
+
+## Critical fixes that made this work (env-specific, ephemeral on pod restart)
+- **`WITH_NVIDIA_PEERMEM=0`** on prefill+decode → mooncake dmabuf KV registration (else
+  3072× `Failed to register memory` → `KVTransferError`). [found by codex]
+- **CUDA graph ON** is mandatory for perf: eager mode gave **85 tok/s total / 366 ms TPOT /
+  65% client-timeout failures** at conc 16 — a ~22× throughput / ~27× TPOT regression vs graphs on.
+- enroot nvidia hook patched `--no-persistenced --no-fabricmanager`; `/dev/infiniband` mounted;
+  enroot import temp on ext4 `/scratch`. (See HANDOVER-disagg-kv-transfer.md / HANDBACK-codex-dmabuf.md.)
+
+## Provenance
+Nodes slinky-0/1; sweep run as overlap step on prefill job 61 (decode job 62), router :8002.
+Server logs: `/data/home/johnson/enroot/{prefill-cg,decode-cg,router-sweep}.log`. Sweep
+driver log: `/data/home/johnson/enroot/sweep.log`.
diff --git a/together_runner/slurm-disagg/CLAUDE.md b/together_runner/slurm-disagg/CLAUDE.md
new file mode 100644
index 000000000..57848a31c
--- /dev/null
+++ b/together_runner/slurm-disagg/CLAUDE.md
@@ -0,0 +1,60 @@
+# slurm-disagg — Claude Code memory (quick debug/ramp-up)
+
+2-node SGLang **prefill/decode-disaggregated** benchmark harness on Slurm + enroot/pyxis.
+Productized ClusterMAX disagg phase-0 proof. **Portable**: meant to run unmodified on any
+such cluster. Sibling of `../CLAUDE.md` (single-node together_runner) but separate code.
+
+## State (2026-06-30)
+- Committed on branch `together-runner-slurm-disagg`, **PR #2** open vs main
+  (togethercomputer/InferenceX). Author `Johnsonms <lizhaofu@gmail.com>`, no Claude trailer.
+- Validated on slinky (Qwen3-32B 1P1D, TP8): conc 16/64/256 match 06-29 baseline, 0 failures.
+- **OPEN issue**: conc=128 reproducible throughput dip (~3.3k, below conc64). NOT the refactor
+  (CPU alloc identical old vs new — proven). See `INVESTIGATE-conc128.md`. Deferred.
+
+## Architecture (how it runs)
+ONE 2-node allocation, everything is an overlap step into it:
+```
+00_setup.sh   -> salloc -N2 --no-shell (ALLOC_JOB) ; patch enroot nvidia hook ; enroot import ; hf download
+01_preflight  -> auto-detect IB_DEVICES + peermem/dmabuf ; verify IB ports + bind-mount + dmabuf
+10_launch.sh  -> prefill@node0 + decode@node1 + router@node0  (all `srun --jobid=$ALLOC --overlap`)
+20_benchmark  -> bench client (overlap step on node0) -> result JSONs + summary
+teardown.sh   -> scancel $ALLOC_JOB ; rm nodes/state files
+run_all.sh    -> 00 -> 01 -> 10 -> 20
+```
+Resolved state lives in `$LOG_DIR` (= `$ENROOT_DIR` = `$HOME/enroot`):
+- `disagg_nodes.env` — ALLOC_JOB, PREFILL/DECODE_NODE+IP, partition (written by 00_setup).
+- `disagg_detected.env` — IB_DEVICES, WITH_NVIDIA_PEERMEM (written by 01_preflight).
+- `disagg_state.env` — ENDPOINT (written by 10_launch).
+- `load_nodes()` (nodes only, for preflight) vs `load_resolved()` (nodes + detected, for launch/bench).
+
+## What's AUTO vs PINNED
+- AUTO: partition (first w/ ≥2 idle GPU nodes), nodes (Slurm-assigned), IB_DEVICES
+  (`nvidia-smi topo -m` + `/sys` link_layer, majority-fabric), WITH_NVIDIA_PEERMEM
+  (peermem present→default; absent+drv≥535→0/dmabuf), enroot temp (first non-overlay fs).
+- PINNED on purpose: GPUS_PER_NODE=8, TP=8 (B200). Override anything via env / config.env.
+
+## Gotchas / hard-won facts (don't relearn these)
+- **enroot nvidia-hook sed-patch is unavoidable** (needs sudo, ephemeral). enroot 4.0.1 runs
+  system+user hooks.d with NO basename dedup (user hook can't override system 98-nvidia.sh);
+  pyxis ignores per-job ENROOT_SYSCONF_PATH. Both verified. Patch is idempotent + self-checking.
+- **KV transfer needs the dmabuf path** here (`WITH_NVIDIA_PEERMEM=0`) — `nvidia_peermem` absent
+  (driver 580, only gdrdrv+nvidia_fs). Without it: 3072× `Failed to register memory` → KVTransferError.
+- **CUDA graph mandatory** (default ON) — eager ≈ 22× slower / 65% timeouts.
+- **enroot import temp must be non-overlay fs** — pod `/` is overlayfs, can't mknod whiteouts.
+- **Every step gets only 2 CPUs** on slinky (CR_CORE_MEMORY, no DefCpuPerGPU) — suspected
+  throughput cap; see INVESTIGATE-conc128.md item 1 (`--exclusive`/`--cpus-per-task`).
+- Router MUST mount the model dir (else tokenizer 404s to HF); runs on the prefill node.
+
+## Debug entry points
+- Server logs: `$HOME/enroot/{prefill,decode,router}.log`. Sweep: `$HOME/enroot/sweep.log`,
+  results `$HOME/enroot/sweep/*.bench.json`.
+- Liveness while running: `tail -f decode.log` (look for `Decode batch ... gen throughput`),
+  `squeue --me` (ALLOC_JOB), endpoint `curl $ENDPOINT/health`.
+- Re-detect RDMA only: `bash 01_preflight.sh` (rewrites detected.env from clean config).
+- Re-run subset: `CONC_LIST="64" bash 20_benchmark.sh` (endpoint must be up).
+- Cross-cluster: just `bash run_all.sh` — if it fails, preflight prints the exact reason.
+
+## Related docs
+`README.md` (usage), `PORTABILITY-ANALYSIS.md` (root-cause table + decisions),
+`BENCHMARK-RECORD-qwen3-32b-disagg.md` (numbers), `INVESTIGATE-conc128.md` (open perf question).
+Testbed/node fixes: see repo memory `disagg-enroot-node-setup`.
diff --git a/together_runner/slurm-disagg/INVESTIGATE-conc128.md b/together_runner/slurm-disagg/INVESTIGATE-conc128.md
new file mode 100644
index 000000000..1f75cd519
--- /dev/null
+++ b/together_runner/slurm-disagg/INVESTIGATE-conc128.md
@@ -0,0 +1,95 @@
+# Investigation: conc=128 throughput dip (1P1D, Qwen3-32B, slinky)
+
+Status: **OPEN — deferred**. Captured 2026-06-30 so we can resume without re-deriving.
+Owner: johnson. Related: `BENCHMARK-RECORD-qwen3-32b-disagg.md`, `PORTABILITY-ANALYSIS.md`.
+
+## The observation
+1k/1k sweep, 1P1D, TP8 each. **0 failed requests at every point.** conc 16/64/256 match the
+06-29 baseline; **conc=128 is reproducibly low**:
+
+| conc | 06-29 baseline | 06-30 run1 | 06-30 run2 | note |
+|---:|---:|---:|---:|---|
+| 16  | 2,087 | 1,874 | 2,156 | run1 low, run2 on-baseline |
+| 64  | 4,591 | 4,614 | —     | ✓ |
+| 128 | 4,512 | 3,437 | 3,298 | **reproducibly ~3.3–3.4k, below conc=64** |
+| 256 | 4,898 | 4,892 | —     | ✓ |
+
+The dip is *physically odd*: conc=128 sits **below** conc=64, and TTFT jumps (med ~6.8–8.8 s,
+p99 ~37 s). On a healthy curve 128 should sit between 64 and 256.
+
+## Is it our refactor? — NO (proven 2026-06-30)
+Hypothesis was that the allocation/overlap-step model (this PR) changed CPU/resource isolation
+vs the old separate-`srun`-job model, starving prefill at the conc=128 operating point.
+
+**Disproved by direct probe** — both models grant the SAME CPUs per node:
+
+```
+# NEW model (overlap step into 'salloc -N2 --gres=gpu:8' allocation):
+srun --jobid=$ALLOC --overlap -N1 -w slinky-0 bash -c 'nproc; grep Cpus_allowed_list /proc/self/status'
+  -> nproc=2   Cpus_allowed_list=0-1   SLURM_CPUS_ON_NODE=2
+
+# OLD model (standalone job, what 06-29 used):
+srun -p slinky -N1 -w slinky-0 --gres=gpu:8 bash -c 'nproc; grep Cpus_allowed_list /proc/self/status'
+  -> nproc=2   Cpus_allowed_list=0-1   SLURM_CPUS_ON_NODE=2
+```
+
+Both = 2 CPUs/node (cluster is `SelectTypeParameters=CR_CORE_MEMORY` with **no `DefCpuPerGPU`**,
+so a gpu:8 request gets the minimal 1–2 cores). Serving args / nodes / IB list are byte-for-byte
+identical between the runs. **So the refactor is not the cause of the 128 dip.**
+
+Most likely: **environmental / 1P1D dynamics variance** — at conc=128 the single prefill instance
+interleaves badly with decode (prefill bursts starve decode), an unstable operating point; 06-29's
+4,512 was a luckier sample. (conc=16 also swung 1,874↔2,156 between runs → this setup has real
+run-to-run variance.)
+
+## Separate finding worth chasing: both models are CPU-starved to 2 cores
+Every server (prefill, decode, router, bench client) runs pinned to **2 CPUs**. CPU-bound work
+(tokenization, scheduler, sampling, FastAPI/uvicorn, mooncake orchestration) is throttled. This
+likely **caps throughput across ALL points**, and more CPU headroom may also **stabilize the 128
+point**. This is the highest-value lever and is independent of the refactor.
+
+## Plan to resume (ranked)
+1. **Give the allocation real CPUs, re-sweep.** Easiest, highest upside. In `disagg_lib.sh`
+   `ensure_allocation`, add to the `salloc`: `--cpus-per-task` (or `--exclusive`, or
+   `--cpus-per-gpu=<n>`). Probe the node core count first (`sinfo -h -n <node> -o '%c'` showed
+   the *allocated* 2, not physical — check `scontrol show node <n> | grep CPUTot`; the box has
+   ~160). Try `--exclusive` (whole node) → re-run conc 128 (and full sweep). Expectation: all
+   points rise; if 128 also normalizes, the dip was CPU contention after all.
+2. **A/B old vs new launch at conc=128**, CPU held constant, to fully close the refactor question
+   (should match — both 2 CPU). Old launch commands preserved below.
+3. **Instrument the 128 point.** While conc=128 runs, watch:
+   - prefill log: `Prefill batch ... #queue-req`, input throughput — is prefill the bottleneck?
+   - decode log: `Decode batch ... #running-req, gen throughput` — is decode starved (running-req
+     oscillating / low) at 128 but steady at 256?
+   - `nvidia-smi dmon` / GPU util on both nodes; CPU util of the 2 allowed cores (likely pegged).
+4. **Sweep around it**: conc 96 / 128 / 160 / 192 to map whether it's a single bad point or a
+   trough between 64 and 256.
+
+## Reproducing the ORIGINAL (06-29) conditions — so we don't lose the 4,512 baseline
+The 06-29 run used the OLD launch (separate jobs, hardcoded nodes) + first 20_benchmark. That code
+was refactored in place (not in git), but the exact commands are preserved in
+`~/enroot/HANDOVER-disagg-kv-transfer.md` + `HANDBACK-codex-dmabuf.md`, and reproduced here:
+
+```bash
+IMG=$HOME/enroot/sglang-dev-cu13.sqsh
+MODEL=$HOME/models/qwen3-32b
+IB=mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_4,mlx5_5,mlx5_6,mlx5_7
+MOUNTS=$MODEL:$MODEL,$HOME/enroot:$HOME/enroot,$HOME/.cache/huggingface:/root/.cache/huggingface,/dev/infiniband:/dev/infiniband
+
+# PREFILL (slinky-0) — standalone job, CUDA graph ON (omit --disable-cuda-graph):
+srun -p slinky -N1 -w slinky-0 --gres=gpu:8 --job-name=pd_prefill \
+  --container-image=$IMG --container-mounts=$MOUNTS \
+  bash -c 'WITH_NVIDIA_PEERMEM=0 python3 -m sglang.launch_server --model-path '$MODEL' \
+    --served-model-name qwen3-32b --tp 8 --host 0.0.0.0 --port 30000 --trust-remote-code \
+    --disaggregation-mode prefill --disaggregation-bootstrap-port 9000 --disaggregation-ib-device '$IB
+# DECODE (slinky-1): same but --disaggregation-mode decode (no bootstrap port).
+# ROUTER: srun --jobid=<prefill_jobid> --overlap -w slinky-0 ... launch_router --pd-disaggregation \
+#   --prefill http://<pf_ip>:30000 9000 --decode http://<dec_ip>:30000 --port 8002 --policy random
+```
+Note: 06-29 also ran on **2 CPUs/node** (same default), so the 4,512 should be reproducible under
+the same conditions — the gap is variance, not a lost config. For a *deterministic* baseline, pin
+CPUs via `--exclusive` (item 1) in BOTH old and new and compare.
+
+## Not a blocker
+0 failures, correctness intact, peak (4,892) matches baseline. conc=128 is a perf-shape question,
+documented for later — does not block the harness landing.
diff --git a/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md b/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md
new file mode 100644
index 000000000..814c59a63
--- /dev/null
+++ b/together_runner/slurm-disagg/PORTABILITY-ANALYSIS.md
@@ -0,0 +1,82 @@
+# slurm-disagg — 坑梳理与可移植性重构设计
+
+整个 2-node SGLang PD-disagg bring-up 过程中遇到的坑、根因、以及让方案在**新集群**
+上仍然稳定的重构计划。bring-up 已在 slinky(Slurm-on-k8s, 2×8B200)用 Qwen3-32B
+1P1D 跑通并出 benchmark(见 BENCHMARK-RECORD),本文是「跑通 → 可靠固化」的依据。
+
+## 坑分类（关键维度：换新集群会不会爆）
+
+| # | 坑 | 类别 | 新集群风险 |
+|---|---|---|---|
+| 1 | enroot import temp 必须 ext4 | A 环境硬约束 | 低 |
+| 2 | enroot nvidia hook sed 补丁 (`--no-persistenced --no-fabricmanager`) | C 可消除 hack | 中 |
+| 3 | bind-mount `/dev/infiniband` | A 环境硬约束 | 低 |
+| 4 | router `--jobid=<prefill> --overlap` | A Slurm gang sched | 低 |
+| 5 | router 必须挂 model dir | A 配置正确性 | 低 |
+| 6 | `WITH_NVIDIA_PEERMEM=0`（强制 dmabuf） | B 需检测 | 高 |
+| 7 | `IB_DEVICES` GPU↔NIC 邻接表硬编码 | B 需检测 | 最高 |
+| 8 | CUDA graph 必须开 | D 性能默认 | 低 |
+
+## 根因
+
+- **#1** pod 的 `/` 是 overlayfs，`enroot import` 的 `mknod` whiteout 在 overlay-on-overlay
+  上被内核禁止 → `aufs2ovlfs ... Operation not permitted`。已用 `ENROOT_TEMP_PATH=/scratch`（ext4）解。正确解，非 hack。
+- **#2** `nvidia-container-cli` 想 bind-mount persistenced/fabricmanager 的 socket，pod 里没有 → 启动失败。当前用 `sudo sed -i` 改全局 hook（最脏：sudo + 改系统文件 + 重启重做）。
+- **#3** pyxis 不自动透传 IB 字符设备。bind-mount 即正确解。
+- **#4** prefill 用满 `gres=gpu:8`，Slurm 不在满载节点上调度新 job；router 不要 GPU，用 overlap step 进 prefill 的 alloc。
+- **#5** router 加载 tokenizer，没有本地 model dir 就 404 到 HF。
+- **#6** `nvidia_peermem` 模块缺失（驱动 580，仅 gdrdrv+nvidia_fs）；mooncake 默认走 peermem `ibv_reg_mr` 注册 GPU 显存失败 → 3072× `Failed to register memory` → `KVTransferError`。`=0` 强制 dmabuf（`ibv_reg_dmabuf_mr`，驱动 ≥535 支持）。探针：256MiB CUDA tensor，无 env 返回 -202，`=0` 返回 0。
+- **#7** `mlx5_9,10,11,12,4,5,6,7` 是本机 GPU0..7→NIC 顺序，且避开 mlx5_0–3（存储 NIC）。物理拓扑，每集群/机型不同（GB200/GB300≠B200）。
+- **#8** eager 比 graph 慢 ~22×、65% 超时。默认 ON。
+
+## 决策（已确认）
+
+1. **环境相关项（#6/#7）默认自动探测**：`IB_DEVICES`、`WITH_NVIDIA_PEERMEM` 默认留空 =
+   preflight 自动探测填充；手动 export 可覆盖。
+2. **#2 尝试消除 → 实验结论：当前栈上无法干净消除,退回加固 sed 补丁。**
+   - 用户级 hook 覆盖**不可行**:enroot 4.0.1 `runtime.sh:93` 对 `[系统, 用户]` 两个
+     hooks.d **各跑一遍、不按文件名去重**,用户那份 `98-nvidia.sh` 只会追加执行,无法
+     替换系统那份(系统那份照样先跑、照样失败)。
+   - `ENROOT_SYSCONF_PATH` 重定向**不可行**:实测 `srun --export=ALL,ENROOT_SYSCONF_PATH=<weka>`
+     启容器,哨兵 hook 未生效 → **pyxis 忽略该变量**。改 pyxis plugstack.conf 又需 root,与 sudo 同级,无收益。
+   - 故保留 `sudo sed` patch,但做成:幂等 + sudo 不可用清晰报错 + patch 后 grep 校验,失败自动还原 .bak。
+
+## 重构计划
+
+1. **新增 `01_preflight.sh`** + 在 `disagg_lib.sh` 加探测函数：
+   - `detect_ib_devices`：`nvidia-smi topo -m` / `/sys/class/infiniband/<dev>/device` PCIe
+     affinity → 按 GPU 顺序生成 `IB_DEVICES`，过滤非 GPU NIC；与手动值比对告警。
+   - `check_ib_ports`：每节点逐端口 `state==ACTIVE` & `phys_state==LinkUp`，记录 `link_layer`
+     (IB vs RoCE)、`rate`。
+   - `probe_dmabuf`：容器内 256MiB CUDA tensor mooncake register_memory，带/不带
+     `WITH_NVIDIA_PEERMEM=0` 各一次 → 自动判定路径；并查 `modinfo nvidia_peermem` + 驱动版本。
+   - `check_ibv_in_container`：ctypes `ibv_get_device_list` 数 == 主机枚举数；
+     `libibverbs` 是否导出 `ibv_reg_dmabuf_mr`。
+2. **`config.env`**：`IB_DEVICES` / `WITH_NVIDIA_PEERMEM` 默认空 → 自动探测。
+3. **#2 hook**：试用户级 hook 覆盖；失败退回幂等 sed + 自检。
+4. **就绪判定**：`wait_health` 外补「cuda graph capture done」标志，避免半就绪。
+5. **收尾**：BENCHMARK-RECORD 收进本目录（README 已相对引用，文件实际在 `~/enroot/`）；补 `.gitignore`。
+
+## 第二轮：消除剩余 hardcode（换集群零改动）
+
+审计发现 IB/peermem 之外还有 6 处 cluster 特定项，处理如下：
+
+| 项 | 原来 | 现在 |
+|---|---|---|
+| 节点名 `slinky-0/1` | `--nodelist` 写死 | **单个 2-node allocation**(`salloc --no-shell`),prefill/decode/router/bench 全是 `--jobid=$ALLOC --overlap` step;节点由 Slurm 分配,持久化到 `disagg_nodes.env` |
+| 分区 `slinky` | 写死 | 空=自动选「含 ≥2 idle GPU 节点」的分区(可 `PARTITION=` 覆盖) |
+| `GPUS_PER_NODE`/`TP`=8 | 写死 | **保留写死**(B200 专用,刻意不探测) |
+| enroot temp `/scratch` | 写死、假设 ext4 | 00_setup 在节点上自动探测首个可写的非 overlay/tmpfs 目录(`/scratch /raid /var/tmp ...`,可 `ENROOT_SCRATCH=` 覆盖) |
+| 路径 `/data/home/johnson/...` | 写死 | 默认 `$HOME/...`;`ENROOT_DIR`/`MODELS_ROOT` 必须在跨节点共享盘(preflight 验证) |
+| `_detect_rdma` 取 `ports/1` | 假设单端口 | (已知小限制)多端口 HCA 需扩展;当前取端口 1 |
+
+架构收益:teardown 只需 `scancel` 一个 allocation;所有 step 共享它;再不依赖任何节点名。
+`load_nodes`(仅节点)给 preflight 用,`load_resolved`(节点+IB/peermem)给 launch/bench 用——
+preflight 每次从干净 config 重新探测,不会把自己上轮产出的 detected.env 误当成用户显式配置。
+
+端到端在 slinky 复测(allocation 模型):Slurm 自动分到 slinky-[0-1],自动探测 IB 列表与
+之前手写一致,KV 传输正常。
+
+完成后端到端重跑出 benchmark，再提交（push fork，PR body→md，无 Claude trailer）。
+
+相关记忆：[[disagg-enroot-node-setup]] · [[inferencex-together-runner]]
diff --git a/together_runner/slurm-disagg/README.md b/together_runner/slurm-disagg/README.md
new file mode 100644
index 000000000..2e7a30a1c
--- /dev/null
+++ b/together_runner/slurm-disagg/README.md
@@ -0,0 +1,76 @@
+# slurm-disagg — 2-node SGLang prefill/decode-disaggregated benchmark
+
+Reproducible harness for the ClusterMAX inference-disagg phase-0 readiness proof:
+deploy a 2-node prefill/decode-disaggregated SGLang endpoint on a Slurm + enroot/pyxis
+cluster, transfer KV cross-node over RDMA, and benchmark.
+
+**Portable by design — runs unmodified on a new cluster.** The harness grabs ONE 2-node
+allocation (`salloc --no-shell`) and runs prefill/decode/router/bench as overlap steps
+into it, so **Slurm picks the nodes** (no node names hardcoded). Partition, IB device
+list, the peermem-vs-dmabuf KV path, and the enroot temp dir are all **auto-detected**;
+only GPUs/node (8) and TP (8) are pinned, on purpose, for B200.
+
+Validated 2026-06-29 with **Qwen3-32B** (1P1D, TP8 each): peak ~4,900 tok/s total
+(~306 tok/s/GPU), 0 failures. See `BENCHMARK-RECORD-qwen3-32b-disagg.md` (re-validated 2026-06-30 post-refactor).
+
+## Run it
+```bash
+cd together_runner/slurm-disagg
+bash 00_setup.sh       # one-time per pod boot: node fixes + image import + weights (idempotent)
+bash 01_preflight.sh   # auto-detect IB devices + peermem path, verify RDMA — seconds; writes detected.env
+bash 10_launch.sh      # prefill + decode + router; waits healthy; writes state file
+bash 20_benchmark.sh   # concurrency sweep -> result JSONs + summary table
+bash teardown.sh       # scancel the 2-node allocation (stops prefill/decode/router)
+# or: bash run_all.sh  # 00 -> 01 -> 10 -> 20 in sequence
+```
+All config in `config.env` (sourced everywhere; override by exporting first).
+`IB_DEVICES` and `WITH_NVIDIA_PEERMEM` default to **empty = auto-detected by preflight**
+— set them explicitly only to override. `PROBE_MOONCAKE=1 bash 01_preflight.sh` adds the
+heavy mooncake `register_memory` GPU probe.
+
+## Layout
+- `config.env` — paths, model, ports, sweep params. Partition/nodes/IB/peermem/enroot-temp
+  default to AUTO; GPUs/node + TP pinned to 8 (B200). `ENROOT_DIR`/`MODELS_ROOT` default to
+  `$HOME` and must be on a cross-node-shared FS (preflight verifies).
+- `disagg_lib.sh` — allocation/node resolution (`ensure_allocation`, `resolve_partition`),
+  container-mounts, health-wait, cuda-graph helpers, `load_nodes`/`load_resolved`.
+- `00_setup.sh` — grab/reuse the 2-node allocation (persists `disagg_nodes.env`), ephemeral
+  node fixes, `enroot import` (auto-detects a node-local non-overlay temp), `hf download`. Idempotent.
+- `01_preflight.sh` — detect GPU↔NIC topology + peermem/dmabuf decision + IB-port/RDMA
+  checks on both nodes; writes `$LOG_DIR/disagg_detected.env` (the resolved RDMA truth source).
+- `_detect_rdma.py` — `nvidia-smi topo -m` + `/sys` link_layer → GPU-ordered IB device list.
+- `_check_container_rdma.py` — in-container ibv device count, dmabuf export, optional mooncake probe.
+- `10_launch.sh` — sources detected.env; launch prefill/decode/router, wait healthy, write `disagg_state.env`.
+- `20_benchmark.sh` — sweep via `utils/bench_serving/benchmark_serving.py`, summary table.
+- `teardown.sh` — cancel jobs. `run_all.sh` — full pipeline.
+
+## Environment gotchas (this is Slurm-on-k8s; nodes are nested pods)
+Preflight (`01_preflight.sh`) now **detects or verifies** most of these so a new cluster
+fails in seconds with a clear reason instead of mid-launch:
+1. **enroot import temp must be on a node-local, non-overlay filesystem such as ext4**
+   (`ENROOT_TEMP_PATH=/scratch/...`) — overlay `/` can't `mknod` the overlayfs whiteouts →
+   `aufs2ovlfs ... Operation not permitted`.
+2. **enroot nvidia hook** patched with `--no-persistenced --no-fabricmanager` (00_setup.sh,
+   needs sudo) — else `nvidia-container-cli` can't bind-mount those sockets in the pod.
+   *This sed-patch is unavoidable on this stack:* enroot 4.0.1 runs system+user `hooks.d`
+   with no basename dedup (a user hook can't replace the system one), and pyxis ignores a
+   per-job `ENROOT_SYSCONF_PATH` redirect — both verified. The patch is idempotent + self-checking.
+3. **RDMA-to-pod** = bind-mount `/dev/infiniband` (in `container_mounts`). Preflight verifies
+   libibverbs sees the same device count as `/sys/class/infiniband`. (Image lacks `ibv_devinfo`/`rdma` CLIs — cosmetic.)
+4. **KV-mem registration path auto-decided** by preflight: `nvidia_peermem` present → default
+   path; absent + driver ≥535 → `WITH_NVIDIA_PEERMEM=0` (mooncake dmabuf). Without the right
+   choice: thousands of `Failed to register memory` → `KVTransferError` on routed requests.
+5. **IB device list auto-detected** (`--disaggregation-ib-device`): GPU↔NIC PCIe affinity from
+   `nvidia-smi topo -m`, filtered to the majority fabric (IB drops the Ethernet storage NICs;
+   RoCE keeps Ethernet). Override via `IB_DEVICES=...`.
+6. **CUDA graph ON** (default) is mandatory for perf — eager mode is ~22x slower with
+   65% client timeouts. (`DISABLE_CUDA_GRAPH=1` only for debugging.)
+7. **Router** runs as an `srun --jobid=<allocation> --overlap` step inside the shared 2-node
+   allocation (Slurm won't co-schedule a fresh job on a node whose GPUs are fully allocated)
+   and **mounts the model dir** (else tokenizer 404s to HF).
+
+Fixes 1–2 are ephemeral and reset on pod restart — re-run `00_setup.sh`. Detection (3–5) is
+re-run each `01_preflight.sh`, so it self-adjusts on a new cluster/HW.
+
+## Known limitation (current config)
+1P1D throughput saturates ~conc 64 (the single prefill instance is the bottleneck); TTFT
+tail grows sharply with concurrency. For higher throughput / lower TTFT, scale prefill
+instances (NP1D) — a future extension of `10_launch.sh`.
diff --git a/together_runner/slurm-disagg/_check_container_rdma.py b/together_runner/slurm-disagg/_check_container_rdma.py
new file mode 100644
index 000000000..779343cf9
--- /dev/null
+++ b/together_runner/slurm-disagg/_check_container_rdma.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""In-container RDMA readiness checks for the disagg KV path. Run inside the
+SGLang image (pyxis) on a GPU node. Verifies, in order:
+
+  1. /dev/infiniband is visible and libibverbs enumerates the HCAs (== the count
+     visible under /sys/class/infiniband) — i.e. the bind-mount worked.
+  2. libibverbs exports ibv_reg_dmabuf_mr (the peermem-free GPU-mem path).
+  3. (optional, --mooncake) the Mooncake transfer engine can register a CUDA
+     buffer with WITH_NVIDIA_PEERMEM=0 — the single decisive check for whether
+     cross-node KV transfer will work. Heavy (allocates GPU mem); opt-in.
+
+Exit 0 if all *required* checks pass. Prints a one-line report per check to
+stderr; machine-readable KEY=VALUE to stdout.
+"""
+import ctypes, glob, os, sys
+
+
+def err(*a):
+    print(*a, file=sys.stderr)
+
+
+def check_ibv():
+    sys_n = len(glob.glob("/sys/class/infiniband/*"))
+    try:
+        ib = ctypes.CDLL("libibverbs.so.1")
+    except OSError as e:
+        err(f"[rdma] FAIL: cannot load libibverbs.so.1: {e}")
+        return False, sys_n, 0, False
+    ib.ibv_get_device_list.restype = ctypes.POINTER(ctypes.c_void_p)
+    n = ctypes.c_int(0)
+    lst = ib.ibv_get_device_list(ctypes.byref(n))
+    cnt = n.value
+    if lst:
+        ib.ibv_free_device_list(lst)
+    dmabuf = hasattr(ib, "ibv_reg_dmabuf_mr")
+    ok = cnt > 0 and cnt == sys_n
+    lvl = "OK" if ok else "FAIL"
+    err(f"[rdma] {lvl}: libibverbs sees {cnt} device(s); /sys shows {sys_n} "
+        f"(/dev/infiniband bind-mount {'works' if cnt>0 else 'MISSING'})")
+    err(f"[rdma] {'OK' if dmabuf else 'WARN'}: ibv_reg_dmabuf_mr "
+        f"{'exported' if dmabuf else 'ABSENT (dmabuf path unavailable)'}")
+    return ok, sys_n, cnt, dmabuf
+
+
+def check_mooncake():
+    """256 MiB CUDA tensor; Mooncake register_memory must return 0 with
+    WITH_NVIDIA_PEERMEM=0 (dmabuf). Returns True/False/None(unavailable)."""
+    os.environ.setdefault("WITH_NVIDIA_PEERMEM", "0")
+    try:
+        import torch
+        from mooncake.engine import TransferEngine  # noqa
+    except Exception as e:
+        err(f"[rdma] SKIP mooncake probe: import failed ({e})")
+        return None
+    try:
+        eng = TransferEngine()
+        # hostname/device auto; minimal init varies by mooncake version — guard.
+        buf = torch.empty(256 * 1024 * 1024 // 4, dtype=torch.float32, device="cuda")
+        ptr = buf.data_ptr()
+        rc = eng.register_memory(ptr, buf.numel() * 4)
+        if rc == 0:
+            eng.unregister_memory(ptr)
+        err(f"[rdma] {'OK' if rc==0 else 'FAIL'}: mooncake register_memory rc={rc} "
+            f"(WITH_NVIDIA_PEERMEM={os.environ['WITH_NVIDIA_PEERMEM']})")
+        return rc == 0
+    except Exception as e:
+        err(f"[rdma] SKIP mooncake probe: engine init failed ({e})")
+        return None
+
+
+def main():
+    want_mooncake = "--mooncake" in sys.argv
+    ok, sys_n, cnt, dmabuf = check_ibv()
+    print(f"IBV_DEVICE_COUNT={cnt}")
+    print(f"SYS_IB_COUNT={sys_n}")
+    print(f"DMABUF_SUPPORTED={'1' if dmabuf else '0'}")
+    if want_mooncake:
+        mc = check_mooncake()
+        print(f"MOONCAKE_DMABUF_OK={'1' if mc else ('0' if mc is False else 'skip')}")
+        if mc is False:
+            ok = False
+    return 0 if ok else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/together_runner/slurm-disagg/_detect_rdma.py b/together_runner/slurm-disagg/_detect_rdma.py
new file mode 100644
index 000000000..bd1e347f7
--- /dev/null
+++ b/together_runner/slurm-disagg/_detect_rdma.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""Auto-detect the GPU-adjacent RDMA NIC list for --disaggregation-ib-device.
+
+Portable across clusters/HW (B200/GB200/GB300, IB or RoCE): parses
+`nvidia-smi topo -m` for each GPU's closest NIC (best PCIe relationship), maps
+NIC<k> -> mlx5 name via the topo legend, then keeps only NICs whose port is
+ACTIVE/LinkUp and whose link_layer matches the *majority* fabric among the
+GPU-adjacent NICs (so InfiniBand clusters drop the Ethernet storage NICs, and
+RoCE clusters keep their Ethernet NICs).
+
+Emits to stdout:
+  IB_DEVICES=<comma list in GPU order>     (the value for --disaggregation-ib-device)
+  IB_LINK_LAYER=<InfiniBand|Ethernet>
+and a human-readable report to stderr. Exit non-zero if detection fails so the
+caller can fall back to an explicit IB_DEVICES or abort.
+
+Run on the host node OR inside the container (needs nvidia-smi + /sys/class/infiniband).
+"""
+import os, re, subprocess, sys
+from collections import Counter
+
+SYS_IB = "/sys/class/infiniband"
+# PCIe relationship preference (closest first). PIX = same switch (ideal).
+RANK = {"PIX": 0, "PXB": 1, "PHB": 2, "NODE": 3, "SYS": 4}
+
+
+def err(*a):
+    print(*a, file=sys.stderr)
+
+
+def port_attr(dev, name):
+    try:
+        with open(f"{SYS_IB}/{dev}/ports/1/{name}") as f:
+            return f.read().strip()
+    except OSError:
+        return ""
+
+
+def topo():
+    """Parse `nvidia-smi topo -m` into GPU->NIC adjacency.
+
+    Returns (gpu_to_nics, nic_to_dev):
+      gpu_to_nics -- {gpu index: [NIC index, ...]} ordered closest-first by
+                     RANK, ties broken by NIC index
+      nic_to_dev  -- {NIC index: device name} taken from the "NIC<k>: <name>"
+                     legend lines
+    Raises RuntimeError when nvidia-smi exits non-zero or no header row
+    containing both GPU0 and NIC0 is found.
+    """
+    out = subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True)
+    if out.returncode != 0:
+        raise RuntimeError(f"nvidia-smi topo -m failed: {out.stderr.strip()}")
+    # nvidia-smi underlines the header with ANSI escapes — strip them so the
+    # GPU0/NIC0 labels are plain word tokens.
+    lines = re.sub(r"\x1b\[[0-9;]*m", "", out.stdout).splitlines()
+
+    # NIC legend: "  NIC3: mlx5_3"
+    nic_to_dev = {}
+    for ln in lines:
+        m = re.match(r"\s*NIC(\d+):\s*(\S+)", ln)
+        if m:
+            nic_to_dev[int(m.group(1))] = m.group(2)
+
+    # Header row: the first line mentioning both GPU0 and NIC0.
+    header = None
+    for ln in lines:
+        if re.search(r"\bGPU0\b", ln) and re.search(r"\bNIC0\b", ln):
+            header = ln
+            break
+    if header is None:
+        raise RuntimeError("could not find topo matrix header (GPU0..NIC0)")
+    cols = header.split()  # ['GPU0','GPU1',...,'NIC0',...,'CPU','Affinity',...]
+    col_label = {}  # position-in-cols -> label (GPU<n>/NIC<n> columns only)
+    for i, tok in enumerate(cols):
+        if re.fullmatch(r"GPU\d+", tok) or re.fullmatch(r"NIC\d+", tok):
+            col_label[i] = tok
+
+    gpu_to_nics = {}
+    for ln in lines:
+        toks = ln.split()
+        # Only matrix rows whose first token is a GPU label are of interest.
+        if not toks or not re.fullmatch(r"GPU\d+", toks[0]):
+            continue
+        gpu = int(toks[0][3:])
+        # A data row is whitespace-split the same way as the header, but it
+        # starts with its own row label, so header column i corresponds to
+        # toks[i + 1].
+        ranked = []
+        for i, tok in enumerate(cols):
+            lbl = col_label.get(i)
+            if not lbl or not lbl.startswith("NIC"):
+                continue
+            # Row shorter than the header: no cell for this column.
+            if i + 1 >= len(toks):
+                continue
+            cell = toks[i + 1]
+            # Cells that are not keys of RANK are ignored.
+            if cell in RANK:
+                ranked.append((RANK[cell], int(lbl[3:])))
+        # Sort by (rank, NIC index): closest relationship first.
+        ranked.sort()
+        gpu_to_nics[gpu] = [n for _, n in ranked]
+    return gpu_to_nics, nic_to_dev
+
+
+def main():
+    try:
+        gpu_to_nics, nic_to_dev = topo()
+    except Exception as e:
+        err(f"[detect_rdma] ERROR: {e}")
+        return 2
+    if not gpu_to_nics:
+        err("[detect_rdma] ERROR: no GPUs found in topo matrix")
+        return 2
+
+    # Collect each GPU's best-rank NIC candidates (those tied at the top rank).
+    candidates = []  # (gpu, [dev,...]) usable RDMA devices at best PCIe rank
+    for gpu in sorted(gpu_to_nics):
+        nics = gpu_to_nics[gpu]
+        devs = []
+        for n in nics:
+            dev = nic_to_dev.get(n)
+            if not dev or not os.path.isdir(f"{SYS_IB}/{dev}"):
+                continue
+            state = port_attr(dev, "state")          # "4: ACTIVE"
+            phys = port_attr(dev, "phys_state")       # "5: LinkUp"
+            ll = port_attr(dev, "link_layer")         # InfiniBand | Ethernet
+            if "ACTIVE" not in state or "LinkUp" not in phys:
+                continue
+            devs.append((dev, ll))
+        candidates.append((gpu, devs))
+
+    # Majority fabric among all GPU-adjacent usable NICs -> drops the off-fabric
+    # NICs (e.g. Ethernet storage NICs on an IB cluster).
+    fabric = Counter(ll for _, devs in candidates for _, ll in devs)
+    if not fabric:
+        err("[detect_rdma] ERROR: no ACTIVE/LinkUp RDMA NIC adjacent to any GPU")
+        return 3
+    majority_ll = fabric.most_common(1)[0][0]
+
+    chosen = []
+    for gpu, devs in candidates:
+        pick = next((d for d, ll in devs if ll == majority_ll), None)
+        if pick is None:
+            err(f"[detect_rdma] WARN: GPU{gpu} has no {majority_ll} NIC at best rank "
+                f"(candidates: {[d for d,_ in devs] or 'none'}) — skipped")
+            continue
+        chosen.append(pick)
+
+    if len(chosen) != len(candidates):
+        err(f"[detect_rdma] WARN: matched {len(chosen)}/{len(candidates)} GPUs to a NIC")
+    if not chosen:
+        err("[detect_rdma] ERROR: no NICs chosen")
+        return 3
+
+    err(f"[detect_rdma] fabric={majority_ll}  per-GPU NIC: " +
+        ", ".join(f"GPU{g}->{d}" for (g, _), d in zip(candidates, chosen)))
+    print(f"IB_DEVICES={','.join(chosen)}")
+    print(f"IB_LINK_LAYER={majority_ll}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/together_runner/slurm-disagg/config.env b/together_runner/slurm-disagg/config.env
new file mode 100644
index 000000000..57cea4856
--- /dev/null
+++ b/together_runner/slurm-disagg/config.env
@@ -0,0 +1,70 @@
+# slurm-disagg — unified config for the 2-node SGLang prefill/decode-disaggregated
+# benchmark on a Slurm + enroot/pyxis cluster. Sourced by every script; override any
+# value by exporting it before sourcing. Cluster-specific bits (partition, nodes, IB
+# devices, peermem path, enroot temp) default to AUTO and are resolved at runtime —
+# the goal is to run unmodified on any such cluster.
+#
+# This file is sourced, not executed. Keep it side-effect free.
+
+# ---- Slurm allocation ----
+# The harness grabs ONE 2-node allocation (salloc --no-shell) and runs prefill/decode/
+# router/bench as overlap steps into it — so Slurm picks the nodes (no node names to
+# hardcode). 00_setup.sh resolves these and persists job id + nodes to disagg_nodes.env.
+# PARTITION empty -> first partition with >=2 idle GPU nodes (override to pin).
+# PREFILL_NODE/DECODE_NODE empty -> assigned by the allocation; set BOTH to force
+#   salloc --nodelist=<prefill>,<decode>.
+export PARTITION="${PARTITION:-}"
+export PREFILL_NODE="${PREFILL_NODE:-}"
+export DECODE_NODE="${DECODE_NODE:-}"
+# B200-specific: 8 GPUs/node, TP across the whole node. Hardcoded on purpose.
+export GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+export TP="${TP:-8}"
+export ALLOC_NAME="${ALLOC_NAME:-pd_disagg}"
+export ALLOC_TIME="${ALLOC_TIME:-04:00:00}"   # allocation walltime (setup+startup+sweep)
+
+# ---- Shared paths: ENROOT_DIR/MODELS_ROOT must be on a CROSS-NODE-SHARED FS (preflight
+# verifies). Default to $HOME (shared on most Slurm/NFS/weka homes); override if not. ----
+export ENROOT_DIR="${ENROOT_DIR:-$HOME/enroot}"
+export SQSH="${SQSH:-$ENROOT_DIR/sglang-dev-cu13.sqsh}"     # shared squashfs image
+export DOCKER_IMAGE="${DOCKER_IMAGE:-docker://lmsysorg/sglang:dev-cu13}"
+export MODELS_ROOT="${MODELS_ROOT:-$HOME/models}"
+export MODEL_DIR="${MODEL_DIR:-$MODELS_ROOT/qwen3-32b}"
+export MODEL_HF_ID="${MODEL_HF_ID:-Qwen/Qwen3-32B}"
+export SERVED_NAME="${SERVED_NAME:-qwen3-32b}"
+export HF_CACHE="${HF_CACHE:-$HOME/.cache/huggingface}"
+export RESULTS_DIR="${RESULTS_DIR:-$ENROOT_DIR/sweep}"
+export LOG_DIR="${LOG_DIR:-$ENROOT_DIR}"
+
+# ---- enroot import temp: MUST be a node-local NON-overlay fs (overlay / can't mknod the
+# overlayfs whiteouts). EMPTY = auto-detected per node by 00_setup.sh (first writable
+# ext4/xfs/btrfs among /scratch /raid /var/tmp /tmp ...). Set to pin a specific path. ----
+export ENROOT_SCRATCH="${ENROOT_SCRATCH:-}"
+
+# ---- Serving ----
+export PORT="${PORT:-30000}"                 # prefill/decode OpenAI port (each on its own node)
+export BOOTSTRAP_PORT="${BOOTSTRAP_PORT:-9000}"  # prefill KV bootstrap server
+export ROUTER_PORT="${ROUTER_PORT:-8002}"    # sgl-router (on prefill node)
+# GPU-adjacent IB devices for --disaggregation-ib-device, in GPU order.
+# EMPTY = auto-detect in 01_preflight.sh (nvidia-smi topo -m + /sys link_layer);
+# set explicitly to override (e.g. "mlx5_9,mlx5_10,..."). Resolved value lands in
+# $LOG_DIR/disagg_detected.env. On this box auto-detect yields mlx5_9,10,11,12,4,5,6,7.
+export IB_DEVICES="${IB_DEVICES:-}"
+# KV-cache GPU-mem registration path. EMPTY = auto-decide in 01_preflight.sh:
+# nvidia_peermem present -> leave unset (peermem path); absent + driver>=535 ->
+# set 0 (force Mooncake dmabuf path). Set explicitly to override.
+export WITH_NVIDIA_PEERMEM="${WITH_NVIDIA_PEERMEM:-}"
+# CUDA graph is MANDATORY for perf (eager = ~22x slower, 65% client timeouts).
+export DISABLE_CUDA_GRAPH="${DISABLE_CUDA_GRAPH:-0}"
+
+# ---- Benchmark ----
+export ISL="${ISL:-1024}"
+export OSL="${OSL:-1024}"
+export CONC_LIST="${CONC_LIST:-16 64 128 256}"
+export PROMPTS_PER_CONC="${PROMPTS_PER_CONC:-8}"   # num_prompts = conc * this (min 160)
+export REPO_ROOT="${REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
+
+# ---- Node/IP resolution lives in disagg_lib.sh (ensure_allocation), which persists to
+# $LOG_DIR/disagg_nodes.env; scripts call load_resolved() after sourcing this file.
+# PREFILL_IP/DECODE_IP are filled there once nodes are known. ----
+export PREFILL_IP="${PREFILL_IP:-}"
+export DECODE_IP="${DECODE_IP:-}"
diff --git a/together_runner/slurm-disagg/disagg_lib.sh b/together_runner/slurm-disagg/disagg_lib.sh
new file mode 100755
index 000000000..f11f99575
--- /dev/null
+++ b/together_runner/slurm-disagg/disagg_lib.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Shared helpers for slurm-disagg. Source AFTER config.env.
+
+trlog()  { echo "[disagg $(date +%H:%M:%S)] $*"; }
+trerr()  { echo "[disagg $(date +%H:%M:%S)] ERROR: $*" >&2; }
+
+# Load the persisted node/allocation resolution (ALLOC_JOB + nodes + IPs) if the file
+# exists. Always returns 0, so a missing file is not an error.
+load_nodes() {
+    if [[ -f "$LOG_DIR/disagg_nodes.env" ]]; then
+        source "$LOG_DIR/disagg_nodes.env"
+    fi
+    return 0
+}
+
+# load_nodes, then also the detected RDMA config (IB_DEVICES + WITH_NVIDIA_PEERMEM) from
+# $LOG_DIR/disagg_detected.env when present. Meant for launch/benchmark; 01_preflight
+# *produces* detected.env and must re-derive from clean config defaults each run, so it
+# should not call this. Always returns 0.
+load_resolved() {
+    load_nodes
+    if [[ -f "$LOG_DIR/disagg_detected.env" ]]; then
+        source "$LOG_DIR/disagg_detected.env"
+    fi
+    return 0
+}
+
+# Routable IP for a Slurm node: hostname resolution first, then Slurm's NodeAddr.
+# Arguments: $1 - node name
+# Outputs:   the address on stdout (an empty line when neither source resolves)
+# Returns:   0 always; callers test the printed value for emptiness
+ip_of() {
+    local ip
+    # `|| true`: with `set -e -o pipefail` in the caller, a failed lookup would otherwise
+    # abort the script right here instead of falling through to the NodeAddr fallback.
+    ip="$(getent hosts "$1" 2>/dev/null | awk '{print $1; exit}')" || true
+    if [[ -z "$ip" ]]; then
+        ip="$(scontrol show node "$1" 2>/dev/null | grep -oE 'NodeAddr=[^ ]+' | cut -d= -f2)" || true
+    fi
+    echo "$ip"
+}
+
+# True if Slurm job $1 is currently allocated (pending/running/configuring).
+# Arguments: $1 - job id (an empty or missing id is "not alive")
+# Returns:   0 when squeue reports state R, PD or CF for the job; non-zero otherwise
+alloc_alive() {
+    [[ -n "${1:-}" ]] || return 1
+    # Match the whole state code: an unanchored 'R|PD|CF' would also accept any other
+    # compact state that merely contains an R (e.g. PR, RV, RQ).
+    squeue -h -j "$1" -o '%t' 2>/dev/null | grep -qE '^(R|PD|CF)[[:space:]]*$'
+}
+
+# Pick PARTITION if unset: first partition with >=2 fully-idle GPU nodes.
+resolve_partition() {
+    [[ -n "$PARTITION" ]] && return 0
+    declare -A cnt
+    local node part gres st g
+    while IFS='|' read -r node part gres st; do
+        part="${part%\*}"; [[ "$st" == "idle" ]] || continue
+        g="$(grep -oE 'gpu(:[^:,]+)*:[0-9]+' <<<"$gres" | grep -oE '[0-9]+$' | head -1)"
+        [[ -n "$g" && "$g" -gt 0 ]] && cnt[$part]=$(( ${cnt[$part]:-0} + 1 ))
+    done < <(sinfo -h -N -o '%N|%P|%G|%t' 2>/dev/null)
+    for part in "${!cnt[@]}"; do [[ ${cnt[$part]} -ge 2 ]] && { PARTITION="$part"; break; }; done
+    [[ -n "$PARTITION" ]] || { trerr "no partition with >=2 idle GPU nodes (set PARTITION)"; return 1; }
+    trlog "auto-selected PARTITION=$PARTITION"
+}
+
+# Ensure a live 2-node allocation; set ALLOC_JOB/PREFILL_NODE/DECODE_NODE/IPs and persist
+# to $LOG_DIR/disagg_nodes.env. Idempotent: reuses the persisted allocation if still alive.
+ensure_allocation() {
+    if alloc_alive "${ALLOC_JOB:-}"; then trlog "reusing allocation $ALLOC_JOB"; else
+        resolve_partition || return 1
+        local nlflag=""
+        [[ -n "$PREFILL_NODE" && -n "$DECODE_NODE" ]] && nlflag="--nodelist=$PREFILL_NODE,$DECODE_NODE"
+        trlog "requesting 2-node allocation on '$PARTITION' (gpu:$GPUS_PER_NODE/node, t=$ALLOC_TIME) ..."
+        local out; out="$(salloc -N2 --gres=gpu:"$GPUS_PER_NODE" -p "$PARTITION" --no-shell \
+            -J "$ALLOC_NAME" --time="$ALLOC_TIME" $nlflag 2>&1)" || { trerr "salloc failed: $out"; return 1; }
+        ALLOC_JOB="$(grep -oE 'job allocation [0-9]+' <<<"$out" | grep -oE '[0-9]+' | head -1)"
+        [[ -n "$ALLOC_JOB" ]] || { trerr "could not parse job id from: $out"; return 1; }
+    fi
+    # Discover the assigned nodes from the allocation (Slurm-ordered).
+    local nl; nl="$(squeue -h -j "$ALLOC_JOB" -o '%N' 2>/dev/null)"
+    mapfile -t NODES < <(scontrol show hostnames "$nl" 2>/dev/null)
+    [[ ${#NODES[@]} -ge 2 ]] || { trerr "allocation $ALLOC_JOB has <2 nodes ($nl)"; return 1; }
+    PREFILL_NODE="${PREFILL_NODE:-${NODES[0]}}"
+    DECODE_NODE="${DECODE_NODE:-${NODES[1]}}"
+    PREFILL_IP="$(ip_of "$PREFILL_NODE")"; DECODE_IP="$(ip_of "$DECODE_NODE")"
+    [[ -n "$PREFILL_IP" && -n "$DECODE_IP" ]] || { trerr "could not resolve node IPs ($PREFILL_NODE/$DECODE_NODE)"; return 1; }
+
+    mkdir -p "$LOG_DIR"
+    cat > "$LOG_DIR/disagg_nodes.env" <<EOF
+# Auto-resolved by 00_setup.sh (ensure_allocation). teardown.sh removes this.
+ALLOC_JOB=$ALLOC_JOB
+PARTITION=$PARTITION
+PREFILL_NODE=$PREFILL_NODE
+DECODE_NODE=$DECODE_NODE
+GPUS_PER_NODE=$GPUS_PER_NODE
+TP=$TP
+PREFILL_IP=$PREFILL_IP
+DECODE_IP=$DECODE_IP
+EOF
+    export ALLOC_JOB PARTITION PREFILL_NODE DECODE_NODE GPUS_PER_NODE TP PREFILL_IP DECODE_IP
+    trlog "allocation $ALLOC_JOB: prefill=$PREFILL_NODE($PREFILL_IP) decode=$DECODE_NODE($DECODE_IP)"
+}
+
+# Steps are run INTO the allocation with `srun --jobid=$ALLOC_JOB --overlap ...`. No
+# alloc_step wrapper is defined in this file; scripts inline the srun call for clarity.
+
+# Print the common pyxis container-mounts string: model dir, log dir, HF cache (mounted at
+# /root/.cache/huggingface) and /dev/infiniband, which is REQUIRED for RDMA-to-pod (KV
+# transfer). Optional $1 is a list of extra mounts WITHOUT a leading comma; the separator
+# is added here when $1 is non-empty.
+container_mounts() {
+    local extra="${1:-}"
+    local mounts="${MODEL_DIR}:${MODEL_DIR},${LOG_DIR}:${LOG_DIR},${HF_CACHE}:/root/.cache/huggingface,/dev/infiniband:/dev/infiniband"
+    if [[ -n "$extra" ]]; then
+        mounts+=",$extra"
+    fi
+    echo "$mounts"
+}
+
+# Poll an HTTP /health until 200, a process/job dies, or timeout.
+# wait_health <url> <timeout_s> [<jobid_to_watch>]
+# Arguments: $1 - URL to poll (curl -sf, 4 s per attempt)
+#            $2 - timeout in seconds (default 1800)
+#            $3 - optional Slurm job id; polling stops early once it is no longer alive
+# Returns:   0 when the URL answers successfully; 1 on watched-job exit or timeout
+wait_health() {
+    local url="$1" timeout="${2:-1800}" watch_job="${3:-}" t0
+    t0=$(date +%s)
+    while :; do
+        if curl -sf -m4 "$url" >/dev/null 2>&1; then return 0; fi
+        # Check the job *state* via alloc_alive rather than squeue's exit code, which
+        # does not reliably show that a job has ended.
+        if [[ -n "$watch_job" ]] && ! alloc_alive "$watch_job"; then
+            trerr "watched job $watch_job exited before $url became healthy"; return 1
+        fi
+        if (( $(date +%s) - t0 > timeout )); then
+            trerr "timeout waiting for $url"; return 1
+        fi
+        # No bare `(( i++ ))` counter here: it returns status 1 when i is 0 and would
+        # abort a `set -e` caller on the first retry.
+        sleep 5
+    done
+}
+
+# Print the server flag controlling CUDA graph: nothing by default (graph ON), or
+# "--disable-cuda-graph" when DISABLE_CUDA_GRAPH=1. Always returns 0.
+cuda_graph_arg() {
+    if [[ "${DISABLE_CUDA_GRAPH:-0}" == "1" ]]; then
+        echo "--disable-cuda-graph" || true
+    fi
+    return 0
+}
diff --git a/together_runner/slurm-disagg/run_all.sh b/together_runner/slurm-disagg/run_all.sh
new file mode 100755
index 000000000..7158b713b
--- /dev/null
+++ b/together_runner/slurm-disagg/run_all.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# One-shot end-to-end: setup (idempotent) -> preflight -> launch -> benchmark sweep.
+# Each stage runs as its own bash process; `set -e` stops the pipeline at the first
+# stage that fails. Leaves the endpoint running; run teardown.sh when done.
+#
+#   bash run_all.sh
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+bash "$HERE/00_setup.sh"
+bash "$HERE/01_preflight.sh"
+bash "$HERE/10_launch.sh"
+bash "$HERE/20_benchmark.sh"
+echo
+# config.env is sourced inside the command substitution only, to resolve LOG_DIR for the
+# message without leaking its exports into this shell.
+echo "End-to-end done. Endpoint still up — see $(. "$HERE/config.env"; echo "$LOG_DIR/disagg_state.env")."
+echo "Teardown: bash $HERE/teardown.sh"
diff --git a/together_runner/slurm-disagg/teardown.sh b/together_runner/slurm-disagg/teardown.sh
new file mode 100755
index 000000000..075451a1f
--- /dev/null
+++ b/together_runner/slurm-disagg/teardown.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Tear down the disagg endpoint by cancelling the 2-node allocation; the prefill/decode/
+# router steps running inside it die with it. The node + state files are removed so the
+# next run allocates fresh. (image, weights, detected RDMA config preserved.)
+#
+#   bash teardown.sh
+set -uo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$HERE/config.env"
+source "$HERE/disagg_lib.sh"
+load_resolved
+
+# Prefer the persisted job id; otherwise fall back to this user's first job whose name
+# matches ALLOC_NAME.
+JOB="${ALLOC_JOB:-}"
+if [[ -z "$JOB" ]]; then
+    JOB="$(squeue --me -h -n "$ALLOC_NAME" -o '%i' 2>/dev/null | head -1)"
+fi
+
+if [[ -z "$JOB" ]]; then
+    trlog "no allocation found to cancel."
+else
+    trlog "scancel allocation $JOB"
+    scancel "$JOB" 2>/dev/null || true
+fi
+rm -f "$LOG_DIR/disagg_state.env" "$LOG_DIR/disagg_nodes.env"
+trlog "torn down. (image, weights, detected.env preserved)"