From fe4cb3368ab102c9a70d481f35ee7f4b48cc08df Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 19:12:12 -0400
Subject: [PATCH 01/20] [Klaud Cold] minimaxm3-fp8-mi355x-vllm-disagg: day-zero
 MoRI-IO disagg smoke test

MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the
day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3): 1 prefill (TP8) +
1 decode (TP8) at conc 1, validating the MoRI-IO KV-transfer disagg pipeline
end-to-end for M3.

Layered on the MoRI-IO patch-removal infra (#1585): brings in that PR's
amd_utils changes (setup_deps.sh drops the runtime vLLM/MoRI-IO patches;
server_vllm.sh passes "read_mode": true in kv_connector_extra_config;
submit.sh drops the VLLM_MORIIO_CONNECTOR_READ_MODE export; models_vllm.yaml
switches --all2all-backend mori -> mori_low_latency) and the two job.slurm
hunks (vllm-router image bump nightly-20260511 -> nightly-20260603, drop
VLLM_MORIIO_CONNECTOR_READ_MODE env), while keeping main's atom-disagg
support intact.

Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128
(MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend
TRITON_ATTN, minimax_m3 tool/reasoning parsers; no EP (TP8, MoE experts
TP-sharded as in the single-node M3 TP8 recipe).

perf-changelog.yaml and amd-master.yaml contain only M3 changes.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  37 ++
 benchmarks/multi_node/amd_utils/job.slurm     |   3 +-
 .../multi_node/amd_utils/models_vllm.yaml     |  15 +-
 .../multi_node/amd_utils/server_vllm.sh       |   6 +-
 benchmarks/multi_node/amd_utils/setup_deps.sh | 559 +-----------------
 benchmarks/multi_node/amd_utils/submit.sh     |   1 -
 .../minimaxm3_fp8_mi355x_vllm-disagg.sh       |  78 +++
 perf-changelog.yaml                           |  12 +
 8 files changed, 145 insertions(+), 566 deletions(-)
 create mode 100644 benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5e5452c4c..d7433e4d1 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2729,3 +2729,40 @@ minimaxm3-fp8-mi325x-vllm-mtp:
       - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
       - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the
+# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to
+# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on
+# the MoRI-IO patch-removal infra (#1585). No EP (TP8 only); MoE experts are
+# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in
+# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8).
+minimaxm3-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 17f5b4f54..67160c262 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -316,7 +316,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
 export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
 # vLLM external router container
-VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}"
 ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 
@@ -401,7 +401,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
         -e UCX_LOG_LEVEL=warn
         -e HSA_ENABLE_SDMA=1
         -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
-        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
         -e PYTHONPYCACHEPREFIX=/tmp/pycache
     )
 elif [[ "$ENGINE" == "atom-disagg" ]]; then
diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
index b051de8d9..e78b6c647 100644
--- a/benchmarks/multi_node/amd_utils/models_vllm.yaml
+++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -26,15 +26,15 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:
 
 Kimi-K2.5-MXFP4:
   prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
-  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
   hf_dir: "models--amd--Kimi-K2.5-MXFP4"
 
 MiniMax-M2.5:
   # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup.
   # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
-  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
-  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
   hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
 
@@ -42,3 +42,12 @@ gpt-oss-120b:
   prefill_flags: "--tensor-parallel-size 8"
   decode_flags: "--tensor-parallel-size 8"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
+
+MiniMax-M3-MXFP8:
+  # MiniMax-M3 MXFP8 disagg smoke test (TP8 prefill + TP8 decode, no EP).
+  # --block-size 128 is mandatory (MSA sparse/index cache); text-only benchmark
+  # so --language-model-only frees the vision encoder. gfx950 uses FP8 KV cache.
+  prefill_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
+  decode_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_USE_BREAKABLE_CUDAGRAPH=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--MiniMaxAI--MiniMax-M3-MXFP8"
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index d61fe0359..f02b1cd56 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -478,7 +478,7 @@ else
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${DECODE_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
index add2e3fa5..35eaf17dc 100644
--- a/benchmarks/multi_node/amd_utils/setup_deps.sh
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -3,8 +3,8 @@
 # setup_deps.sh — Install missing disagg dependencies at container start.
 #
 # Dispatched by $ENGINE (set by server.sh dispatcher):
-#   vllm-disagg   -> vLLM/MoRI-IO patches + UCX/RIXL path exports
-#                    (base image: vllm/vllm-openai-rocm:v0.18.0)
+#   vllm-disagg   -> recipe deps + amd-quark + UCX/RIXL path exports
+#                    (base image: vllm/vllm-openai-rocm:nightly)
 #   sglang-disagg -> SGLang aiter gluon patch + per-model installs
 #                    (base image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-*)
 #
@@ -79,556 +79,6 @@ install_amd_quark() {
     _SETUP_INSTALLED+=("amd-quark")
 }
 
-# ---------------------------------------------------------------------------
-# 8. Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock)
-#    In WRITE mode, save_kv_layer spins forever waiting for the handshake
-#    callback to set write_ready_flags. This blocks the model worker thread,
-#    preventing it from responding to EngineCore shm_broadcast, causing a
-#    TimeoutError cascade and crash.
-#    Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent
-#    the model worker from deadlocking.
-# ---------------------------------------------------------------------------
-patch_moriio_save_kv_timeout() {
-    python3 -c '
-import os, sys
-
-try:
-    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc
-    f = mc.__file__
-    src = open(f).read()
-
-    # Already patched?
-    if "[PATCHED] save_kv_layer timeout" in src:
-        print("[SETUP] save_kv_layer timeout patch already applied")
-        sys.exit(0)
-
-    old = """        while True:
-            if (
-                self._ready_requests.empty()
-                and remote_engine_id not in self.write_ready_flags
-            ):
-                continue"""
-
-    if old not in src:
-        print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch")
-        sys.exit(0)
-
-    new = """        # [PATCHED] save_kv_layer — null guard + timeout + sleep
-        if remote_engine_id is None:
-            return
-        import time as _time, os as _os
-        _wait_start = _time.monotonic()
-        _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30"))
-        while True:
-            if (
-                self._ready_requests.empty()
-                and remote_engine_id not in self.write_ready_flags
-            ):
-                _elapsed = _time.monotonic() - _wait_start
-                if _elapsed > _SAVE_KV_TIMEOUT:
-                    import logging as _logging
-                    _logging.getLogger("vllm.moriio").warning(
-                        "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for "
-                        "write_ready_flags[%s], breaking to unblock model "
-                        "worker", _elapsed, remote_engine_id)
-                    break
-                _time.sleep(0.001)
-                continue"""
-
-    new_src = src.replace(old, new)
-    if new_src == src:
-        print("[SETUP] WARN: replacement had no effect")
-        sys.exit(0)
-
-    open(f, "w").write(new_src)
-    print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep")
-except Exception as e:
-    print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr)
-'
-    _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch")
-}
-
-# ---------------------------------------------------------------------------
-# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout
-#    The original status.Wait() blocks forever if an RDMA completion never
-#    arrives (e.g., NIC queue saturation at C256). This replaces the unbounded
-#    wait with a polling loop using status.Succeeded() + configurable timeout.
-#    Also adds error handling to the write worker loop so a single failed
-#    transfer doesn't kill the background thread.
-# ---------------------------------------------------------------------------
-patch_moriio_transfer_timeout() {
-    python3 -c '
-import os, sys, textwrap
-
-try:
-    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me
-    f = me.__file__
-    src = open(f).read()
-
-    if "[PATCHED] transfer completion timeout" in src:
-        print("[SETUP] transfer completion timeout patch already applied")
-        sys.exit(0)
-
-    # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout ---
-    old_wait = """    def waiting_for_transfer_complete(self):
-        if not self.transfer_status:
-            return
-
-        transfers_to_wait = []
-        with self.lock:
-            transfers_to_wait = self.transfer_status[:]
-            self.transfer_status.clear()
-
-        for status in transfers_to_wait:
-            try:
-                status.Wait()
-                if not status.Succeeded():
-                    logger.error(
-                        "Transfer failed: %s, Code: %s", status.Message(), status.Code()
-                    )
-                    raise TransferError("MoRIIO transfer failed!")
-            except Exception as e:
-                logger.error("Transfer %s failed: %s", status, e)
-                raise"""
-
-    new_wait = """    def waiting_for_transfer_complete(self):
-        # [PATCHED] transfer completion timeout — bounded polling loop
-        import time as _time, os as _os
-        if not self.transfer_status:
-            return
-
-        _timeout = float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120"))
-
-        transfers_to_wait = []
-        with self.lock:
-            transfers_to_wait = self.transfer_status[:]
-            self.transfer_status.clear()
-
-        _start = _time.monotonic()
-        remaining = list(transfers_to_wait)
-        _polls = 0
-        _completed = 0
-
-        while remaining:
-            _elapsed = _time.monotonic() - _start
-            if _elapsed > _timeout:
-                logger.error(
-                    "[HANGFIX] transfer_timeout elapsed=%.1fs "
-                    "pending=%d/%d completed=%d polls=%d "
-                    "action=raise_transfer_error",
-                    _elapsed, len(remaining), len(transfers_to_wait),
-                    _completed, _polls,
-                )
-                raise TransferError(
-                    f"RDMA transfer timeout after {_elapsed:.1f}s, "
-                    f"{len(remaining)}/{len(transfers_to_wait)} pending"
-                )
-
-            still_waiting = []
-            for status in remaining:
-                try:
-                    if status.Succeeded():
-                        _completed += 1
-                        continue
-                    still_waiting.append(status)
-                except Exception as e:
-                    logger.error(
-                        "[HANGFIX] transfer_poll_error error=%s", e)
-                    raise TransferError(
-                        f"Transfer failed during poll: {e}"
-                    ) from e
-
-            remaining = still_waiting
-            if remaining:
-                _time.sleep(0.005)
-                _polls += 1
-                if _polls % 2000 == 0:
-                    logger.warning(
-                        "[HANGFIX] transfer_wait pending=%d "
-                        "completed=%d elapsed=%.1fs timeout=%.0fs",
-                        len(remaining), _completed,
-                        _time.monotonic() - _start, _timeout,
-                    )"""
-
-    if old_wait not in src:
-        print("[SETUP] WARN: waiting_for_transfer_complete pattern not found")
-        sys.exit(0)
-
-    new_src = src.replace(old_wait, new_wait)
-
-    # --- Patch 2: Add error handling + cleanup to _write_worker_loop ---
-    old_loop = """            self._execute_write_task(task)"""
-
-    new_loop = """            try:
-                self._execute_write_task(task)
-            except Exception as _e:
-                logger.error(
-                    "[HANGFIX] req=%s write_task_failed error=%s "
-                    "action=cleanup_and_mark_done",
-                    task.request_id, _e,
-                )
-                try:
-                    _wr = self.worker.moriio_wrapper
-                    with _wr.lock:
-                        _wr.done_req_ids.append(task.request_id)
-                    _wr.done_remote_allocate_req_dict.pop(
-                        task.request_id, None
-                    )
-                except Exception:
-                    pass"""
-
-    if old_loop in new_src:
-        new_src = new_src.replace(old_loop, new_loop, 1)
-    else:
-        print("[SETUP] WARN: _write_worker_loop pattern not found for error handling")
-
-    # --- Patch 3: Add deferred task timeout to _process_deferred_tasks ---
-    old_deferred = """    def _process_deferred_tasks(self) -> None:
-        \"\"\"Process tasks that were previously deferred.\"\"\"
-        if not self._deferred_tasks:
-            return
-
-        still_deferred: list[WriteTask] = []
-        for task in self._deferred_tasks:
-            if self._is_remote_ready(task):
-                self._execute_write_task(task)
-            else:
-                still_deferred.append(task)
-
-        self._deferred_tasks = still_deferred"""
-
-    new_deferred = """    def _process_deferred_tasks(self) -> None:
-        \"\"\"Process tasks that were previously deferred.\"\"\"
-        # [PATCHED] deferred task timeout — prune stale tasks
-        import time as _time, os as _os
-        if not self._deferred_tasks:
-            return
-
-        _DEFER_TIMEOUT = float(
-            _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60"))
-
-        still_deferred: list[WriteTask] = []
-        for task in self._deferred_tasks:
-            _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic())
-            if _age > _DEFER_TIMEOUT:
-                logger.error(
-                    "[HANGFIX] req=%s deferred_task_expired age=%.1fs "
-                    "action=drop_and_mark_done",
-                    task.request_id, _age,
-                )
-                try:
-                    _wr = self.worker.moriio_wrapper
-                    with _wr.lock:
-                        _wr.done_req_ids.append(task.request_id)
-                    _wr.done_remote_allocate_req_dict.pop(
-                        task.request_id, None)
-                except Exception:
-                    pass
-                continue
-            if self._is_remote_ready(task):
-                try:
-                    self._execute_write_task(task)
-                except Exception as _e:
-                    logger.error(
-                        "[HANGFIX] req=%s deferred_write_failed error=%s",
-                        task.request_id, _e,
-                    )
-                    try:
-                        _wr = self.worker.moriio_wrapper
-                        with _wr.lock:
-                            _wr.done_req_ids.append(task.request_id)
-                        _wr.done_remote_allocate_req_dict.pop(
-                            task.request_id, None)
-                    except Exception:
-                        pass
-            else:
-                still_deferred.append(task)
-
-        self._deferred_tasks = still_deferred"""
-
-    if old_deferred in new_src:
-        new_src = new_src.replace(old_deferred, new_deferred, 1)
-    else:
-        print("[SETUP] WARN: _process_deferred_tasks pattern not found")
-
-    # --- Patch 4: Stamp defer time when task is deferred ---
-    old_defer_add = """                self._deferred_tasks.append(task)"""
-    new_defer_add = """                import time as _time2
-                if not hasattr(task, "_defer_ts"):
-                    task._defer_ts = _time2.monotonic()
-                self._deferred_tasks.append(task)"""
-    if old_defer_add in new_src:
-        new_src = new_src.replace(old_defer_add, new_defer_add, 1)
-    else:
-        print("[SETUP] WARN: deferred task timestamp patch target not found")
-
-    open(f, "w").write(new_src)
-    print("[SETUP] Patched: transfer timeout + writer error handling")
-
-except Exception as e:
-    print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr)
-'
-    _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch")
-}
-
-# ---------------------------------------------------------------------------
-# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer)
-#     The READ-mode spin loop in start_load_kv has the same unbounded-spin
-#     issue as save_kv_layer. Add timeout + sleep + null guard.
-# ---------------------------------------------------------------------------
-patch_moriio_load_kv_timeout() {
-    python3 -c '
-import os, sys
-
-try:
-    import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc
-    f = mc.__file__
-    src = open(f).read()
-
-    if "[PATCHED] start_load_kv timeout" in src:
-        print("[SETUP] start_load_kv timeout patch already applied")
-        sys.exit(0)
-
-    old = """        while True:
-            if (
-                self._ready_requests.empty()
-                and remote_engine_id not in self.load_ready_flag
-                and wait_handshake_readd_req
-            ):
-                continue"""
-
-    if old not in src:
-        print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping")
-        sys.exit(0)
-
-    new = """        # [PATCHED] start_load_kv timeout — prevent model worker deadlock
-        if remote_engine_id is None and not wait_handshake_readd_req:
-            self._reqs_to_send.update(metadata.reqs_to_send)
-            return
-        import time as _time, os as _os
-        _wait_start = _time.monotonic()
-        _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30"))
-        while True:
-            if (
-                self._ready_requests.empty()
-                and remote_engine_id not in self.load_ready_flag
-                and wait_handshake_readd_req
-            ):
-                if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT:
-                    import logging as _logging
-                    _logging.getLogger("vllm.moriio").warning(
-                        "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for "
-                        "load_ready_flag[%s]", _time.monotonic() - _wait_start,
-                        remote_engine_id)
-                    break
-                _time.sleep(0.001)
-                continue"""
-
-    new_src = src.replace(old, new)
-    if new_src == src:
-        print("[SETUP] WARN: start_load_kv replacement had no effect")
-        sys.exit(0)
-
-    open(f, "w").write(new_src)
-    print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep")
-except Exception as e:
-    print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr)
-'
-    _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch")
-}
-
-# ---------------------------------------------------------------------------
-# 11. Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished
-#     vLLM asserts that a request in finished_recving must be either
-#     WAITING_FOR_REMOTE_KVS or finished.  In READ mode the request can
-#     transition to RUNNING before the aggregated recv notification arrives,
-#     crashing the engine with AssertionError.
-#     (present in v0.17.1 & v0.18.0)
-# ---------------------------------------------------------------------------
-patch_scheduler_read_mode_fix() {
-    python3 -c '
-import os, sys
-
-try:
-    import vllm.v1.core.sched.scheduler as smod
-    f = smod.__file__
-    src = open(f).read()
-
-    if "[PATCHED] read-mode recv assertion" in src:
-        print("[SETUP] scheduler read-mode assertion fix already applied")
-        sys.exit(0)
-
-    old_recv = """        for req_id in kv_connector_output.finished_recving or ():
-            logger.debug("Finished recving KV transfer for request %s", req_id)
-            assert req_id in self.requests
-            req = self.requests[req_id]
-            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-                self.finished_recving_kv_req_ids.add(req_id)
-            else:
-                assert RequestStatus.is_finished(req.status)
-                self._free_blocks(self.requests[req_id])"""
-
-    new_recv = """        # [PATCHED] read-mode recv assertion — handle intermediate states
-        for req_id in kv_connector_output.finished_recving or ():
-            logger.debug("Finished recving KV transfer for request %s", req_id)
-            if req_id not in self.requests:
-                logger.debug("Request %s already removed, skipping recv", req_id)
-                continue
-            req = self.requests[req_id]
-            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-                self.finished_recving_kv_req_ids.add(req_id)
-            elif RequestStatus.is_finished(req.status):
-                self._free_blocks(self.requests[req_id])
-            else:
-                logger.debug(
-                    "Request %s recv finished but status=%s (not "
-                    "WAITING_FOR_REMOTE_KVS or finished), skipping "
-                    "block free — will be freed on request completion",
-                    req_id, req.status.name)"""
-
-    if old_recv not in src:
-        print("[SETUP] WARN: scheduler finished_recving pattern not found, skipping")
-        sys.exit(0)
-
-    new_src = src.replace(old_recv, new_recv, 1)
-
-    old_send = """        for req_id in kv_connector_output.finished_sending or ():
-            logger.debug("Finished sending KV transfer for request %s", req_id)
-            assert req_id in self.requests
-            self._free_blocks(self.requests[req_id])"""
-
-    new_send = """        for req_id in kv_connector_output.finished_sending or ():
-            logger.debug("Finished sending KV transfer for request %s", req_id)
-            if req_id not in self.requests:
-                logger.debug("Request %s already removed, skipping send", req_id)
-                continue
-            self._free_blocks(self.requests[req_id])"""
-
-    if old_send in new_src:
-        new_src = new_src.replace(old_send, new_send, 1)
-    else:
-        print("[SETUP] WARN: scheduler finished_sending pattern not found")
-
-    open(f, "w").write(new_src)
-    print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix")
-
-except Exception as e:
-    print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr)
-'
-    _SETUP_INSTALLED+=("scheduler-read-mode-fix")
-}
-
-# ---------------------------------------------------------------------------
-# 12. Idle KV block reaper for disaggregated prefill (READ mode)
-#     The RIXL notification path can lose `finished_sending` signals under
-#     high concurrency with ibv_post_send failures. This leaves KV blocks
-#     permanently allocated on the prefill engine even after the decode has
-#     finished reading. Over multiple benchmark rounds, leaked blocks
-#     accumulate and eventually saturate the prefill KV cache.
-#
-#     Fix: instrument the scheduler's `schedule()` method to detect idle
-#     periods (0 running, 0 waiting for >5s) and force-free blocks for
-#     any remaining requests whose status is finished.
-# ---------------------------------------------------------------------------
-patch_prefill_idle_kv_reaper() {
-    python3 -c '
-import os, sys
-
-try:
-    import vllm.v1.core.sched.scheduler as smod
-    f = smod.__file__
-    src = open(f).read()
-
-    if "[PATCHED] idle-kv-reaper" in src:
-        print("[SETUP] idle KV block reaper already applied")
-        sys.exit(0)
-
-    # Find the _update_from_kv_xfer_finished method end and add reaper logic
-    # We inject into the method that processes KV transfer completions.
-    marker = "[PATCHED] read-mode recv assertion"
-    if marker not in src:
-        print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper")
-        sys.exit(0)
-
-    # Add reaper state initialization to __init__
-    old_init_marker = "self.finished_recving_kv_req_ids"
-    if old_init_marker not in src:
-        print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler")
-        sys.exit(0)
-
-    # Find the first occurrence to insert reaper state
-    init_pos = src.find(old_init_marker)
-    # Find the line containing it
-    line_end = src.find("\n", init_pos)
-    init_line = src[init_pos:line_end]
-
-    # Add reaper state after this line
-    reaper_init = init_line + """
-        # [PATCHED] idle-kv-reaper state
-        self._idle_kv_reaper_ts = 0.0
-        self._idle_kv_reaper_active = False"""
-
-    src = src.replace(init_line, reaper_init, 1)
-
-    # Now add the reaper logic at the end of _update_from_kv_xfer_finished
-    # Find the finished_sending handler we patched
-    send_handler = """        for req_id in kv_connector_output.finished_sending or ():
-            logger.debug("Finished sending KV transfer for request %s", req_id)
-            if req_id not in self.requests:
-                logger.debug("Request %s already removed, skipping send", req_id)
-                continue
-            self._free_blocks(self.requests[req_id])"""
-
-    reaper_logic = send_handler + """
-
-        # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks
-        import time as _time
-        _REAPER_IDLE_SECS = 5.0
-        _num_running = sum(1 for r in self.requests.values()
-                          if r.status == RequestStatus.RUNNING)
-        _should_reap = (_num_running == 0)
-
-        if _should_reap:
-            if not self._idle_kv_reaper_active:
-                self._idle_kv_reaper_active = True
-                self._idle_kv_reaper_ts = _time.monotonic()
-            elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS:
-                _reaped = 0
-                _reap_ids = []
-                for _rid, _req in list(self.requests.items()):
-                    if RequestStatus.is_finished(_req.status):
-                        _reap_ids.append(_rid)
-                for _rid in _reap_ids:
-                    try:
-                        _req = self.requests[_rid]
-                        self._free_blocks(_req)
-                        _reaped += 1
-                    except Exception as _e:
-                        logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e)
-                if _reaped > 0:
-                    logger.warning(
-                        "[KV-REAPER] Force-freed blocks for %d finished "
-                        "requests after %.1fs idle",
-                        _reaped, _time.monotonic() - self._idle_kv_reaper_ts)
-                self._idle_kv_reaper_ts = _time.monotonic()
-        else:
-            self._idle_kv_reaper_active = False"""
-
-    if send_handler in src:
-        src = src.replace(send_handler, reaper_logic, 1)
-    else:
-        print("[SETUP] WARN: send handler not found for reaper injection")
-        sys.exit(0)
-
-    open(f, "w").write(src)
-    print("[SETUP] Patched: idle KV block reaper for prefill")
-
-except Exception as e:
-    print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr)
-'
-    _SETUP_INSTALLED+=("idle-kv-reaper")
-}
-
 # ---------------------------------------------------------------------------
 # SGLang: Patch aiter gluon pa_mqa_logits — fix 2D → 3D instr_shape for
 # Triton ≥ 3.5.
@@ -742,11 +192,6 @@ install_transformers_glm5() {
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     install_recipe_deps
     install_amd_quark
-    patch_moriio_save_kv_timeout
-    patch_moriio_transfer_timeout
-    patch_moriio_load_kv_timeout
-    patch_scheduler_read_mode_fix
-    patch_prefill_idle_kv_reaper
 
     # =========================================================================
     # vLLM: Export UCX/RIXL paths (persists since this file is sourced)
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index fa3d65418..fc91a78e8 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -102,7 +102,6 @@ export PROFILER_ARGS=$profiler_args
 # Engine-specific xP/yD semantics and TP exports
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300}
-    export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
 fi
 # xP = prefill workers, yD = decode workers (may span multiple nodes)
 export xP=$PREFILL_WORKERS
diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh
new file mode 100644
index 000000000..a9a28d889
--- /dev/null
+++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# Submits the MiniMax-M3 FP8 MI355X vLLM disaggregated (prefill/decode) benchmark.
+source "$(dirname "$0")/../benchmark_lib.sh"
+# Inputs vetted by check_env_vars; NOTE(review): MODEL_NAME (exported below) is not listed.
+check_env_vars \
+    CONC_LIST \
+    ISL \
+    OSL \
+    IMAGE \
+    SPEC_DECODING \
+    MODEL_PATH \
+    PREFILL_NUM_WORKERS \
+    PREFILL_TP \
+    PREFILL_EP \
+    PREFILL_DP_ATTN \
+    DECODE_NUM_WORKERS \
+    DECODE_TP \
+    DECODE_EP \
+    DECODE_DP_ATTN \
+    PREFILL_NODES \
+    DECODE_NODES \
+    RANDOM_RANGE_RATIO \
+    FRAMEWORK
+
+if [[ -n "$SLURM_JOB_ID" ]]; then  # already inside a Slurm job: log job id and node
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+set -x
+
+cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
+
+export TIME_LIMIT="08:00:00"
+export MODEL_PATH=$MODEL_PATH
+export MODEL_NAME=$MODEL_NAME
+export CONTAINER_IMAGE=$IMAGE
+
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then  # EP of 1 (or unset) turns expert parallelism off
+    export PREFILL_ENABLE_EP=false
+else
+    export PREFILL_ENABLE_EP=true
+fi
+
+if [[ "$PREFILL_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables it
+    export PREFILL_ENABLE_DP=true
+else
+    export PREFILL_ENABLE_DP=false
+fi
+
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then  # same EP rule, applied to the decode workers
+    export DECODE_ENABLE_EP=false
+else
+    export DECODE_ENABLE_EP=true
+fi
+
+if [[ "$DECODE_DP_ATTN" == "true" ]]; then  # only the exact string "true" enables it
+    export DECODE_ENABLE_DP=true
+else
+    export DECODE_ENABLE_DP=false
+fi
+# Submit via submit.sh: spaces in CONC_LIST become "x"; its stdout is captured as JOB_ID.
+JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
+    $PREFILL_NUM_WORKERS \
+    $DECODE_NODES \
+    $DECODE_NUM_WORKERS \
+    $ISL $OSL "${CONC_LIST// /x}" inf \
+    ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
+    ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
+    ${RANDOM_RANGE_RATIO} \
+    "${NODELIST:-}")
+# $? is submit.sh's exit status here (an assignment keeps the substitution's status).
+if [[ $? -ne 0 ]]; then
+    echo "Failed to submit job" >&2
+    exit 1
+fi
+
+echo "$JOB_ID"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index eb47ba6ae..3b3735b1a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4014,3 +4014,15 @@
     - "1k/1k: 1p1d-dep4-dep8 (conc 4096,12288), 1p4d-dep4-tp8 (conc 4-128), 1p1d-dep4-dep16 (conc 4096,6144)"
     - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm-disagg
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3"
+    - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed"
+    - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)"
+    - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)"
+    - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs"
+    - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)"
+    - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762

From aaadc7b8d6f19f5aafcb45d37cfcefe52c8df062 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 21:42:36 -0400
Subject: [PATCH 02/20] amd_utils/job.slurm: auto-download disagg checkpoint
 when not pre-staged

The first MI355X disagg sweep (run 27515119215) failed: the day-zero
MiniMax-M3-MXFP8 checkpoint is not staged on the disagg cluster's shared FS, so
job.slurm's model search hit a hard FATAL ("Model 'MiniMax-M3-MXFP8' not found.
Searched: ...") before the engine ever started. The single-node recipes
hf-download inside the serving container, but the disagg path historically
required ops to pre-stage checkpoints.

Add an on-demand fallback to the vllm-disagg model-resolution block: when the
checkpoint isn't found, derive the HF repo id from the hf_dir (models--org--name
-> org/name) and download into MODEL_DIR in HF cache layout, then resolve the
snapshot as MODEL_PATH. Staging into MODEL_DIR keeps MODEL_PATH under the dir
that is bind-mounted into the serving container as /models, so the existing
-v ${MODEL_DIR}:/models mount and DOCKER_MODEL_PATH (/models) remap both resolve.

Implementation notes:
  - The host has no hf CLI, so the download runs in a one-shot container of the
    serving image (DOCKER_IMAGE_NAME), which ships huggingface_hub.
  - flock on a lockfile in MODEL_DIR serializes the prefill/decode nodes; a
    re-check of snapshots/ under the lock makes it idempotent (resumable).
  - hf download with a huggingface-cli fallback; 3 retries; HF_TOKEN passed
    through for gated repos.
  - Scoped to the vllm-disagg branch only; pre-staged models never reach this
    path (the search finds them first), so sglang/atom and existing vLLM disagg
    models (M2.5/Kimi) are unaffected.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm | 54 +++++++++++++++++++++--
 1 file changed, 51 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 67160c262..bbbaa8ef4 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -165,9 +165,57 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
     done
 
     if [[ -z "$MODEL_PATH" ]]; then
-        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
-        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
-        exit 1
+        # Not pre-staged. Unlike the single-node recipes (which hf-download inside
+        # the serving container), the disagg path historically required ops to
+        # pre-stage checkpoints, so day-zero models (e.g. MiniMax-M3) FATAL here.
+        # Auto-stage on demand into MODEL_DIR — the dir mounted into the serving
+        # container as /models — so the resolved MODEL_PATH stays under MODEL_DIR
+        # and both the `-v ${MODEL_DIR}:/models` mount and the DOCKER_MODEL_PATH
+        # (/models) remap resolve. Pre-staged models never reach this branch.
+        # The host has no hf CLI, so the download runs in a one-shot container of
+        # the serving image (which ships huggingface_hub). A flock serializes the
+        # prefill/decode nodes; the re-check under the lock makes it idempotent.
+        repo_id="$DISK_DIR_NAME"
+        if [[ "$repo_id" == models--* ]]; then
+            repo_id="${repo_id#models--}"; repo_id="${repo_id/--//}"
+        fi
+        if [[ "$repo_id" != */* ]]; then
+            echo "FATAL: Model '$MODEL_NAME' not found and cannot derive an HF repo"
+            echo "       id from hf_dir '$DISK_DIR_NAME' to auto-download. Searched:"
+            for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+            exit 1
+        fi
+        if ! mkdir -p "$MODEL_DIR" 2>/dev/null || [[ ! -w "$MODEL_DIR" ]]; then
+            echo "FATAL: Model '$MODEL_NAME' not pre-staged and MODEL_DIR"
+            echo "       '$MODEL_DIR' is not writable for auto-download."
+            exit 1
+        fi
+        echo "Model '$MODEL_NAME' not pre-staged; auto-downloading '$repo_id' into $MODEL_DIR (HF cache layout)"
+        if docker ps >/dev/null 2>&1; then DK=docker; else DK="sudo docker"; fi
+        (
+            exec 9>"${MODEL_DIR}/.stage-${DISK_DIR_NAME}.lock"
+            flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; }
+            if [[ ! -d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then
+                for attempt in 1 2 3; do
+                    $DK run --rm --network host \
+                        -v "${MODEL_DIR}:${MODEL_DIR}" \
+                        -e HF_HUB_CACHE="${MODEL_DIR}" \
+                        -e HF_TOKEN="${HF_TOKEN:-}" \
+                        "$DOCKER_IMAGE_NAME" \
+                        bash -lc "hf download '$repo_id' || huggingface-cli download '$repo_id'" && break
+                    [[ $attempt == 3 ]] && { echo "FATAL: hf download failed after $attempt attempts"; exit 1; }
+                    echo "hf download attempt $attempt failed; retrying in 60s"; sleep 60
+                done
+            fi
+        ) || exit 1
+        RESOLVED=$(resolve_hf_cache_path "${MODEL_DIR}/${DISK_DIR_NAME}")
+        if [[ -d "$RESOLVED" ]]; then
+            MODEL_PATH="$RESOLVED"
+            echo "Auto-staged MODEL_PATH: $MODEL_PATH"
+        else
+            echo "FATAL: '$MODEL_NAME' still not found after auto-download at ${MODEL_DIR}/${DISK_DIR_NAME}"
+            exit 1
+        fi
     fi
     echo "Final MODEL_PATH: $MODEL_PATH"
 else

From 7bfdc822bd093721fa4bbafd2d73ffb2524e5042 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 22:36:56 -0400
Subject: [PATCH 03/20] job.slurm: --entrypoint "" for the auto-download
 container

The disagg auto-download reached hf download but failed all 3 attempts: the
one-shot `docker run "$DOCKER_IMAGE_NAME" bash -lc "hf download ..."` did not
override the image ENTRYPOINT, so the vllm-openai API server ran with the bash
command as its args and died with "Failed to infer device type" (no GPU mounted
in the download container). Add --entrypoint "" (as the serving container does)
so bash actually runs hf download.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index bbbaa8ef4..2eb0d7294 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -197,7 +197,11 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
             flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; }
             if [[ ! -d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then
                 for attempt in 1 2 3; do
+                    # --entrypoint "" so bash runs hf download; the vllm-openai
+                    # image's default entrypoint is the API server, which would
+                    # otherwise try (and fail) to infer a GPU device here.
                     $DK run --rm --network host \
+                        --entrypoint "" \
                         -v "${MODEL_DIR}:${MODEL_DIR}" \
                         -e HF_HUB_CACHE="${MODEL_DIR}" \
                         -e HF_TOKEN="${HF_TOKEN:-}" \

From 44c6547447e953f3180af2e2c6e9a9f59c2908ce Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Sun, 14 Jun 2026 22:53:29 -0400
Subject: [PATCH 04/20] M3 disagg: use shared HF cache
 (/it-share/hf-hub-cache); drop auto-download

Per maintainer direction, point the MiniMax-M3 disagg model dir at the cluster's
shared HF cache where the ~414 GB MXFP8 checkpoint is already staged
(/it-share/hf-hub-cache/models--MiniMaxAI--MiniMax-M3-MXFP8), instead of the
launcher default /it-share/data. Scoped to M3 only via the M3 disagg script:

    export MODEL_PATH=/it-share/hf-hub-cache

submit.sh exports MODEL_DIR=$MODEL_PATH and job.slurm resolves the snapshot
under it (search path #1) and bind-mounts MODEL_DIR into the prefill/decode
serving containers. Other disagg models keep /it-share/data.

This supersedes the earlier job.slurm auto-download approach, which is reverted:
job.slurm now differs from main only by the #1585 mori-removal hunks (router
image bump + dropping VLLM_MORIIO_CONNECTOR_READ_MODE).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm     | 58 +------------------
 .../minimaxm3_fp8_mi355x_vllm-disagg.sh       |  7 ++-
 2 files changed, 9 insertions(+), 56 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 2eb0d7294..67160c262 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -165,61 +165,9 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
     done
 
     if [[ -z "$MODEL_PATH" ]]; then
-        # Not pre-staged. Unlike the single-node recipes (which hf-download inside
-        # the serving container), the disagg path historically required ops to
-        # pre-stage checkpoints, so day-zero models (e.g. MiniMax-M3) FATAL here.
-        # Auto-stage on demand into MODEL_DIR — the dir mounted into the serving
-        # container as /models — so the resolved MODEL_PATH stays under MODEL_DIR
-        # and both the `-v ${MODEL_DIR}:/models` mount and the DOCKER_MODEL_PATH
-        # (/models) remap resolve. Pre-staged models never reach this branch.
-        # The host has no hf CLI, so the download runs in a one-shot container of
-        # the serving image (which ships huggingface_hub). A flock serializes the
-        # prefill/decode nodes; the re-check under the lock makes it idempotent.
-        repo_id="$DISK_DIR_NAME"
-        if [[ "$repo_id" == models--* ]]; then
-            repo_id="${repo_id#models--}"; repo_id="${repo_id/--//}"
-        fi
-        if [[ "$repo_id" != */* ]]; then
-            echo "FATAL: Model '$MODEL_NAME' not found and cannot derive an HF repo"
-            echo "       id from hf_dir '$DISK_DIR_NAME' to auto-download. Searched:"
-            for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
-            exit 1
-        fi
-        if ! mkdir -p "$MODEL_DIR" 2>/dev/null || [[ ! -w "$MODEL_DIR" ]]; then
-            echo "FATAL: Model '$MODEL_NAME' not pre-staged and MODEL_DIR"
-            echo "       '$MODEL_DIR' is not writable for auto-download."
-            exit 1
-        fi
-        echo "Model '$MODEL_NAME' not pre-staged; auto-downloading '$repo_id' into $MODEL_DIR (HF cache layout)"
-        if docker ps >/dev/null 2>&1; then DK=docker; else DK="sudo docker"; fi
-        (
-            exec 9>"${MODEL_DIR}/.stage-${DISK_DIR_NAME}.lock"
-            flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; }
-            if [[ ! -d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then
-                for attempt in 1 2 3; do
-                    # --entrypoint "" so bash runs hf download; the vllm-openai
-                    # image's default entrypoint is the API server, which would
-                    # otherwise try (and fail) to infer a GPU device here.
-                    $DK run --rm --network host \
-                        --entrypoint "" \
-                        -v "${MODEL_DIR}:${MODEL_DIR}" \
-                        -e HF_HUB_CACHE="${MODEL_DIR}" \
-                        -e HF_TOKEN="${HF_TOKEN:-}" \
-                        "$DOCKER_IMAGE_NAME" \
-                        bash -lc "hf download '$repo_id' || huggingface-cli download '$repo_id'" && break
-                    [[ $attempt == 3 ]] && { echo "FATAL: hf download failed after $attempt attempts"; exit 1; }
-                    echo "hf download attempt $attempt failed; retrying in 60s"; sleep 60
-                done
-            fi
-        ) || exit 1
-        RESOLVED=$(resolve_hf_cache_path "${MODEL_DIR}/${DISK_DIR_NAME}")
-        if [[ -d "$RESOLVED" ]]; then
-            MODEL_PATH="$RESOLVED"
-            echo "Auto-staged MODEL_PATH: $MODEL_PATH"
-        else
-            echo "FATAL: '$MODEL_NAME' still not found after auto-download at ${MODEL_DIR}/${DISK_DIR_NAME}"
-            exit 1
-        fi
+        echo "FATAL: Model '$MODEL_NAME' not found. Searched:"
+        for p in "${SEARCH_PATHS[@]}"; do echo "  - $p"; done
+        exit 1
     fi
     echo "Final MODEL_PATH: $MODEL_PATH"
 else
diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh
index a9a28d889..f54940e29 100644
--- a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh
+++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh
@@ -31,7 +31,12 @@ set -x
 cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 
 export TIME_LIMIT="08:00:00"
-export MODEL_PATH=$MODEL_PATH
+# MiniMax-M3 MXFP8 (~414 GB) is pre-staged in this cluster's shared HF cache
+# (/it-share/hf-hub-cache/models--MiniMaxAI--MiniMax-M3-MXFP8), not the default
+# /it-share/data the launcher sets. Point the disagg model dir there for M3 only;
+# submit.sh exports MODEL_DIR=$MODEL_PATH and job.slurm resolves the snapshot under
+# it and bind-mounts MODEL_DIR into the prefill/decode serving containers.
+export MODEL_PATH=/it-share/hf-hub-cache
 export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 

From 718444cb7139c9133c044677d64382faa00394db Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 15 Jun 2026 01:25:02 -0400
Subject: [PATCH 05/20] disagg #1762: add 8k1k conc-16 row to run an lm-eval
 (validate correctness)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The conc-1 1k1k smoke test never triggered an eval — the multi-node eval policy
only marks 8k1k entries with conc >= MIN_EVAL_CONC (16). Add an 8k1k conc-16 row
(same 1P TP8 + 1D TP8 layout) so mark_eval_entries marks it run-eval=true
(eval-conc=16), running lm-eval through the MoRI-IO disagg pipeline to validate
correctness. The conc-1 1k1k row stays the latency smoke test.

Run with non-canary-full-sweep-enabled so the (non-min-conc) eval entry runs.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d7433e4d1..145897f36 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2766,3 +2766,26 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+    # 8k1k conc-16 row (same 1P TP8 + 1D TP8 layout) exists so the multi-node
+    # eval policy (8k1k + conc >= MIN_EVAL_CONC=16) marks an lm-eval — validates
+    # the M3 MoRI-IO disagg pipeline's correctness end-to-end. The conc-1 1k1k
+    # row above stays the latency smoke test.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 16 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"

From 84c8d8ecdf1deb104513e95c06c556ab5a421509 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 15 Jun 2026 01:28:42 -0400
Subject: [PATCH 06/20] disagg #1762: sweep conc 1,2,4,8,16 (not just conc 1)

Widen the 1k1k disagg latency/throughput sweep from conc 1 to conc 1,2,4,8,16
(1P TP8 + 1D TP8). The 8k1k conc-16 eval row is unchanged.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 145897f36..6286c3766 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2751,7 +2751,7 @@ minimaxm3-fp8-mi355x-vllm-disagg:
       osl: 1024
       search-space:
       - spec-decoding: "none"
-        conc-list: [ 1 ]
+        conc-list: [ 1, 2, 4, 8, 16 ]
         prefill:
           num-worker: 1
           tp: 8

From c9a10e081d1d880a25e251f5fc98853fbe76f327 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Mon, 15 Jun 2026 01:29:28 -0400
Subject: [PATCH 07/20] disagg #1762: sweep conc 1,2,4,8,16 at both 1k1k and
 8k1k

Widen the disagg sweep from conc 1 to conc 1,2,4,8,16 for both seq-len scenarios
(1P TP8 + 1D TP8). The 8k1k conc-16 point keeps the multi-node eval marked
(eval-conc=16) so lm-eval still validates the MoRI-IO disagg pipeline.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 6286c3766..ad879f894 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2766,15 +2766,15 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
-    # 8k1k conc-16 row (same 1P TP8 + 1D TP8 layout) exists so the multi-node
-    # eval policy (8k1k + conc >= MIN_EVAL_CONC=16) marks an lm-eval — validates
-    # the M3 MoRI-IO disagg pipeline's correctness end-to-end. The conc-1 1k1k
-    # row above stays the latency smoke test.
+    # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc 1,2,4,8,16. The
+    # conc-16 point also makes the multi-node eval policy (8k1k + conc >= 16) mark
+    # an lm-eval (eval-conc=16) — validating the M3 MoRI-IO disagg pipeline's
+    # correctness end-to-end.
     - isl: 8192
       osl: 1024
       search-space:
       - spec-decoding: "none"
-        conc-list: [ 16 ]
+        conc-list: [ 1, 2, 4, 8, 16 ]
         prefill:
           num-worker: 1
           tp: 8

From 299c401029692e371e9aba2fac4eaf3ca7d408b1 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Wed, 17 Jun 2026 16:16:37 +0000
Subject: [PATCH 08/20] Update the vLLM external router container

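vllm/vllm-router only retains ~16 recent nightlies on Docker Hub; older
dated tags are garbage-collected (manifest unknown), which makes `docker run`
fail with exit 125 on any node that has not already cached the image.

Bump the default VLLM_ROUTER_IMAGE from nightly-20260603-e667ebb to
nightly-20260617-e667ebb, and note the retention window next to the default
so the tag gets refreshed before it expires again.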
---
 benchmarks/multi_node/amd_utils/job.slurm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 67160c262..71503f228 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -315,8 +315,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}"
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
 export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
-# vLLM external router container
-VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}"
+# vLLM external router container.
+# NOTE: vllm/vllm-router only retains ~16 recent nightlies on Docker Hub; older
+# dated tags are garbage-collected (manifest unknown)
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260617-e667ebb}"
 ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 

From 08be1aacb6d3b7274b57ed4e84a08f8c1f154320 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Thu, 18 Jun 2026 05:20:43 +0000
Subject: [PATCH 09/20] M3 disagg: per-layer MoRIIO KV transfer for hybrid
 sparse-attn (partial)

MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model:
sparse layers register a separate lightning-indexer cache (MLAAttentionSpec,
rank-3, bf16, key-only) alongside the main cache (FullAttentionSpec, rank-5,
fp8, K+V). The MoRIIO connector assumes one uniform KV layout -- it derives
block geometry from the first cache and reuses first_layer's offsets for every
layer (see its own "hybrid attn" TODO) -- so the bf16 key-only index cache is
transferred with fp8 K+V sizing and gets corrupted on the decode worker,
producing garbage output (disagg gsm8k ~= 0 while single-node M3 is correct).
This is the vLLM analogue of the SGLang MoRI DSA-state bug in patches/mori_conn.py.

- patches/moriio_heterogeneous_kv.py: compute the READ-path transfer geometry
  per layer (own shape/stride/dtype/rank) instead of from the first cache.
  Idempotent; no-op for homogeneous models.
- setup_deps.sh: apply it on the vllm-disagg path.

NOTE: partial fix -- necessary but not yet sufficient. The index cache is also a
separate KV-cache group whose block-table/num_blocks the single-namespace MoRIIO
connector cannot map, so M3 disagg accuracy is still broken pending a larger
multi-group / index-state transfer change. (Disabling sparse attention is not a
viable workaround: M3's fused QKV carries index_k weights, so dropping the
indexer breaks weight load.)

Refs #1762

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../patches/moriio_heterogeneous_kv.py        | 145 ++++++++++++++++++
 benchmarks/multi_node/amd_utils/setup_deps.sh |  23 +++
 2 files changed, 168 insertions(+)
 create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py

diff --git a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py
new file mode 100644
index 000000000..a7ee8c724
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""Patch vLLM's MoRIIOConnector to transfer heterogeneous KV caches per-layer.
+
+Why
+---
+MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model:
+
+  * main attention layers register a ``FullAttentionSpec`` KV cache:
+      rank-5 ``[2, num_blocks, block_size, num_kv_heads, head_dim]``, **fp8**, K+V
+  * the lightning indexer (sparse layers) registers a separate
+    ``MLAAttentionSpec`` index cache (``MiniMaxM3IndexerCache``):
+      rank-3 ``[num_blocks, block_size, head_dim]``, **bf16**, key-only
+
+The upstream MoRIIOConnector assumes a *single uniform* KV layout: it derives
+``self.kv_cache_shape`` / ``block_len`` / ``element_size`` from the **first**
+cache, and ``_read_blocks`` computes the transfer offsets **once** from
+``first_layer`` and reuses them for **every** layer (see the in-code TODO
+"block_len needs to be per-layer for ... hybrid attn"). For M3 this transfers
+the bf16 key-only rank-3 index cache using the fp8 K+V rank-5 main-cache sizing,
+corrupting the indexer state on the decode worker. The sparse layers then select
+the wrong KV blocks and the model emits incoherent tokens (gsm8k ~= 0).
+
+This is the vLLM analogue of the already-shipped SGLang MoRI DSA fix in
+``patches/mori_conn.py`` (see patches/README.md).
+
+Fix
+---
+Compute transfer geometry **per layer** from each layer's own tensor
+(``shape`` / ``stride`` / ``element_size`` / rank), instead of from the first
+cache. For homogeneous models every layer's geometry equals the first cache's,
+so behaviour is unchanged; only hybrid models (M3) are affected.
+
+Two minimal, targeted edits (READ path, which the M3 recipe uses with
+``read_mode: true``):
+
+  1. ``_compute_block_transfer_offsets`` -> use ``self.kv_caches[layer_name]``'s
+     own shape (rank/dims) instead of the global ``self.kv_cache_shape``.
+  2. ``_read_blocks`` -> call ``_compute_block_transfer_offsets`` inside the
+     per-layer loop instead of once for ``first_layer``.
+
+Idempotent: re-running detects the ``PATCHED heterogeneous-kv`` marker and exits.
+"""
+import os
+import sys
+
+
+def _default_target() -> str:
+    """Return the installed vLLM's moriio_connector.py path, or "" without vLLM."""
+    try:
+        import vllm
+    except Exception:
+        return ""
+    pkg_root = os.path.dirname(vllm.__file__)
+    rel_path = "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py"
+    return os.path.join(pkg_root, rel_path)
+
+
+OLD1 = '''        assert self.kv_cache_shape is not None, "KV caches shape not initialized"
+        is_mla = len(self.kv_cache_shape) == 3
+        stride = self.kv_caches[layer_name].stride()
+        sz = self.kv_caches[layer_name].element_size()
+        if is_mla:
+            blknum, blksize, hs = self.kv_cache_shape
+            hn = 1
+            block_stride = stride[0]
+        else:
+            _, blknum, blksize, hn, hs = self.kv_cache_shape'''
+
+NEW1 = '''        # [PATCHED heterogeneous-kv] Use this layer's own shape so caches with a
+        # different rank/dtype (MiniMax-M3: bf16 key-only rank-3 index cache vs
+        # fp8 K+V rank-5 main cache) are sized per-layer, not from the first cache.
+        layer_shape = tuple(self.kv_caches[layer_name].shape)
+        assert layer_shape, "KV caches shape not initialized"
+        is_mla = len(layer_shape) == 3
+        stride = self.kv_caches[layer_name].stride()
+        sz = self.kv_caches[layer_name].element_size()
+        if is_mla:
+            blknum, blksize, hs = layer_shape
+            hn = 1
+            block_stride = stride[0]
+        else:
+            _, blknum, blksize, hn, hs = layer_shape'''
+
+OLD2 = '''        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
+        offs = self._compute_block_transfer_offsets(
+            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
+        )
+
+        for layer_name in self.layer_name_to_local_kv_cache_metadata:
+            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
+                layer_name
+            )
+            # TODO : apply multi-session batch-read when moriio support it
+            transfer_status = self.moriio_wrapper.read_remote_data(
+                offs[2], offs[0], offs[1], sessions[sess_idx]
+            )'''
+
+NEW2 = '''        for layer_name in self.layer_name_to_local_kv_cache_metadata:
+            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
+                layer_name
+            )
+            # [PATCHED heterogeneous-kv] Per-layer offsets so the bf16 key-only
+            # MiniMax-M3 index cache is transferred with its own geometry instead
+            # of the first (main fp8 K+V) layer's.
+            offs = self._compute_block_transfer_offsets(
+                layer_name, local_block_ids, remote_block_ids, remote_moriio_meta
+            )
+            # TODO : apply multi-session batch-read when moriio support it
+            transfer_status = self.moriio_wrapper.read_remote_data(
+                offs[2], offs[0], offs[1], sessions[sess_idx]
+            )'''
+
+
+def main() -> int:
+    """Patch the connector in place; return 0 (applied/no-op) or 1 (bad compile)."""
+    target = sys.argv[1] if len(sys.argv) > 1 else _default_target()
+    if not target or not os.path.isfile(target):
+        print(f"[PATCH] moriio_connector.py not found ({target!r}); skipping")
+        return 0
+    with open(target, encoding="utf-8") as fh:  # close promptly; py source is UTF-8
+        src = fh.read()
+    if "PATCHED heterogeneous-kv" in src:
+        print("[PATCH] moriio heterogeneous-kv already applied")
+        return 0
+    for fn, old in (("_compute_block_transfer_offsets", OLD1), ("_read_blocks", OLD2)):
+        if old not in src:  # all-or-nothing: never apply only one of the two edits
+            print(f"[PATCH] WARN: {fn} pattern not found; "
+                  "connector version changed — skipping (no-op)")
+            return 0
+    src = src.replace(OLD1, NEW1, 1).replace(OLD2, NEW2, 1)
+    # Validate it still compiles before writing.
+    try:
+        compile(src, target, "exec")
+    except SyntaxError as e:
+        print(f"[PATCH] ERROR: patched source fails to compile: {e}")
+        return 1
+    with open(target, "w", encoding="utf-8") as fh:
+        fh.write(src)
+    print("[PATCH] Applied: moriio heterogeneous-kv per-layer transfer "
+          "(MiniMax-M3 sparse index cache)")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
index 35eaf17dc..3e5d82c0c 100644
--- a/benchmarks/multi_node/amd_utils/setup_deps.sh
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -185,6 +185,28 @@ install_transformers_glm5() {
     _SETUP_INSTALLED+=("transformers-glm5")
 }
 
+# ---------------------------------------------------------------------------
+# vLLM: Patch MoRIIOConnector for heterogeneous (hybrid sparse-attn) KV caches.
+# MiniMax-M3 registers a bf16 key-only rank-3 lightning-indexer cache alongside
+# the fp8 K+V rank-5 main cache. Upstream MoRIIO derives one uniform block
+# geometry from the first cache and reuses the first layer's transfer offsets
+# for every layer, corrupting the index cache on the decode worker -> garbage
+# output (gsm8k ~= 0). The overlay makes the READ path compute geometry/offsets
+# per layer. Idempotent; no-op on connector versions that don't match.
+# See patches/moriio_heterogeneous_kv.py and patches/README.md.
+# ---------------------------------------------------------------------------
+patch_moriio_heterogeneous_kv() {
+    local patcher
+    patcher="$(dirname "${BASH_SOURCE[0]}")/patches/moriio_heterogeneous_kv.py"
+    if [[ ! -f "$patcher" ]]; then
+        echo "[SETUP] moriio heterogeneous-kv patcher not found, skipping"
+    elif python3 "$patcher"; then
+        _SETUP_INSTALLED+=("moriio-heterogeneous-kv")  # record only a zero exit
+    else
+        echo "[SETUP] WARN: moriio heterogeneous-kv patch returned non-zero"
+    fi
+}
+
 # =============================================================================
 # Run installers (engine-gated)
 # =============================================================================
@@ -192,6 +214,7 @@ install_transformers_glm5() {
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     install_recipe_deps
     install_amd_quark
+    patch_moriio_heterogeneous_kv
 
     # =========================================================================
     # vLLM: Export UCX/RIXL paths (persists since this file is sourced)

From 005e16b483493b9c850fcb74da780797f9108a74 Mon Sep 17 00:00:00 2001
From: TianDi101 <ditian12@amd.com>
Date: Thu, 18 Jun 2026 07:31:48 +0000
Subject: [PATCH 10/20] feat(amd-disagg): add vLLM MoRIIO KV-layout patch to
 reuse stock minimax-m3 image

The vLLM MoRIIOConnector in vllm/vllm-openai-rocm:minimax-m3 assumes the
FlashAttention KV layout [2, num_blocks, ...] (K/V axis outer) but this
vLLM's backends allocate [num_blocks, 2, ...] (K/V axis inner), so every
disagg block transfer reads the wrong region. Invisible to throughput,
but corrupts GQA/non-MLA accuracy (MiniMax-M3 gsm8k 0.0008 -> 0.957).

Instead of baking a fix into a rebuilt image (-hetkv) or carrying full
vendored copies of the patched files in-tree, carry just the 218-line
unified diff (patches/moriio/moriio-kv-layout-fix.diff) and apply it with
`patch -p1` against the vLLM package dir inside the container at startup,
ahead of the server launch. The repo is already bind-mounted into the
container, so no EXTRA_DOCKER_MOUNTS wiring is needed -- job.slurm
auto-applies the diff when DOCKER_IMAGE_NAME contains "minimax-m3"
(skippable with MORIIO_KV_PATCH=skip), mirroring the existing
mori_conn.py sglang hook. A failed apply aborts the container instead of
silently running unpatched.

Validated on a manual 2-node run (n06-21 prefill+router / n09-21 decode)
using the STOCK image: gsm8k strict-match 0.9568 / flexible-extract
0.9560 (matches the baked image within noise), decode probe healthy.

- patches/moriio/moriio-kv-layout-fix.diff: unified diff vs stock
- job.slurm: in-container `patch` step, MORIIO_KV_PATCH=skip opt-out
- patches/README.md: document the moriio/ diff-apply mechanism

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm     |  27 +++
 .../multi_node/amd_utils/patches/README.md    |  90 +++++++-
 .../patches/moriio/moriio-kv-layout-fix.diff  | 218 ++++++++++++++++++
 3 files changed, 324 insertions(+), 11 deletions(-)
 create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 71503f228..727f64632 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -81,6 +81,32 @@ if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \
     echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}"
 fi
 
+# ── In-tree vLLM MoRIIO patch: auto-apply for known-affected images ──
+# The vLLM MoRIIOConnector (image vllm/vllm-openai-rocm:minimax-m3) ships a
+# transposed-KV-layout bug: it assumes the FlashAttention layout
+# [2, num_blocks, ...] (K/V axis outer) but this vLLM's backends allocate
+# [num_blocks, 2, ...] (K/V axis inner), so every disagg block transfer reads
+# the wrong region. Invisible to throughput, but corrupts GQA/non-MLA accuracy
+# (MiniMax-M3 gsm8k 0.0008 -> 0.958). Fix ships as a unified diff (see
+# patches/moriio/ and patches/README.md), applied to the vLLM package dir
+# inside the container at startup, ahead of the server launch below.
+#
+# Auto-applied when the image tag contains "minimax-m3" (and not the already-
+# fixed "-hetkv" rebuild), unless the caller sets MORIIO_KV_PATCH=skip. The
+# repo is already bind-mounted at DOCKER_MOUNT_PATH ("/workspace"), so the
+# diff needs no extra mount -- just an in-container `patch` call. A failed
+# apply aborts the container: silently running unpatched would silently
+# corrupt accuracy, not just skip a feature.
+_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff"
+_MORIIO_PATCH_CMD=""
+if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \
+   && [[ -f "$_MORIIO_DIFF" ]] \
+   && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \
+   && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then
+    _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff || exit 1"
+    echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout diff inside container: ${_MORIIO_DIFF}"
+fi
+
 xP="${xP:-1}"
 yD="${yD:-1}"
 
@@ -593,6 +619,7 @@ fi
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         set -o pipefail
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
+        '"${_MORIIO_PATCH_CMD:-}"'
         '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
     '
 
diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md
index d9b5de79d..27f9fc81d 100644
--- a/benchmarks/multi_node/amd_utils/patches/README.md
+++ b/benchmarks/multi_node/amd_utils/patches/README.md
@@ -1,16 +1,25 @@
-# In-tree sglang patches for the MoRI PD-disagg path
+# In-tree patches for the MoRI / MoRIIO PD-disagg path
 
-This directory carries small Python overlays that get bind-mounted over
-the upstream sglang source inside the docker container at runtime.
-They are needed because some sglang releases ship known bugs in the
-MoRI disaggregation backend that block our benchmark + accuracy
-configs.
+This directory carries small overlays that fix up the engine source inside
+the docker container at runtime. They are needed because some published
+images ship known bugs in the (MoRI / MoRIIO) disaggregation backend that
+block our benchmark + accuracy configs — so we can keep reusing the
+**stock image** instead of rebuilding a patched one.
 
-The mount is wired through the `EXTRA_DOCKER_MOUNTS` env var that
-`job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after the
-existing `-v` block). The local-test driver scripts under
-`scripts/sglang_disagg/` pre-set this env var to the path of the
-relevant overlay; CI runners that need the patch can do the same.
+- `mori_conn.py` — single-file overlay (bind-mounted) for the **sglang**
+  MoRI backend.
+- `moriio/` — unified-diff overlay (applied with `patch` at container
+  startup) for the **vLLM** MoRIIO connector (`minimax-m3` image). See its
+  section below.
+
+The `mori_conn.py` overlay is wired through the `EXTRA_DOCKER_MOUNTS` env
+var that `job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after
+the existing `-v` block). The local-test driver scripts under
+`scripts/sglang_disagg/` pre-set this env var to the path of the relevant
+overlay; CI runners that need the patch can do the same. The `moriio/`
+diff needs no extra mount — the repo (and thus the diff file) is already
+bind-mounted into the container — `job.slurm` just runs `patch` against it
+before launching the server; see "How to enable" in its section below.
 
 ## `mori_conn.py`
 
@@ -73,6 +82,65 @@ When this env var is unset (CI default for runs that don't need the
 patch), `${EXTRA_DOCKER_MOUNTS:-}` expands to the empty string and
 container behavior is byte-identical to the unpatched path.
 
+## `moriio/` (vLLM MoRIIO connector, MiniMax-M3)
+
+A unified diff (`moriio-kv-layout-fix.diff`), applied with `patch -p1`
+against the vLLM package dir inside the container, touching three files:
+
+```
+/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/
+  ├── moriio_connector.py
+  ├── moriio_engine.py
+  └── moriio_common.py
+```
+
+Source: forked from the stock `vllm/vllm-openai-rocm:minimax-m3` image
+(vLLM `0.22.1rc1.dev490`).
+
+**Bug (general MoRIIO, not M3-specific):** the connector assumed the
+FlashAttention KV layout `[2, num_blocks, block_size, heads, head_dim]`
+(K/V axis **outer**), but this vLLM's attention backends (standard
+`TRITON_ATTN` **and** the M3 sparse backend) allocate
+`[num_blocks, 2, block_size, heads, head_dim]` (K/V axis **inner**).
+`_compute_block_transfer_offsets` indexed blocks with `stride[1]` (the
+K/V stride) instead of `stride[0]` (the block stride), so every disagg
+block transfer read the wrong region. Invisible to throughput
+benchmarks (they don't check output); only the **gsm8k accuracy eval**
+catches it. The connector was only ever correct for MLA models
+(DeepSeek, rank-3 path); MiniMax-M3 is GQA + sparse lightning-indexer
+→ broken (disagg gsm8k `0.0008` token salad).
+
+**Fix** — axis-aware offset computation: detect the block axis + optional
+size-2 K/V axis from each layer's real shape/stride, compute offsets per
+distinct geometry (handles M3's 2nd geometry, the rank-3 bf16 key-only
+indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets
+per geometry. Result: disagg gsm8k `strict-match 0.9583 /
+flexible-extract 0.9575` (matches single-node). Homogeneous models
+(uniform layout) are unaffected — one geometry, one offset set, same
+result. Heterogeneous-TP P/D (prefill TP ≠ decode TP) is still a TODO
+(same as upstream). Full write-up in
+`/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`.
+
+### How to enable
+
+`job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains
+`minimax-m3` (and not the already-fixed `-hetkv` rebuild), unless the
+caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g. the
+`m3_disagg_manual/run_manual_2node.sh` driver, which sets
+`MORIIO_KV_PATCH`), run inside the container before the server starts:
+
+```bash
+patch -p1 -d /usr/local/lib/python3.12/dist-packages \
+  < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
+```
+
+(`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already
+mounts into the container at `/workspace`.)
+
+This lets the **stock** `minimax-m3` image be reused for the E2E
+accuracy run — no `-hetkv` rebuild needed. Retire the overlay once the
+fix lands in a published image; it is not yet upstreamed.
+
 ## When to use which patch
 
 | Image / version | Need `mori_conn.py` overlay? |
diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
new file mode 100644
index 000000000..7f6c435bf
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
@@ -0,0 +1,218 @@
+diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
+index 73694ce32..a30d30af8 100644
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
+@@ -80,6 +80,10 @@ class RemoteAllocInfo:
+     writes_done: int = 0
+     decode_dp_rank: int = 0
+     transfer_offset: tuple[list[int], list[int], list[int]] | None = None
++    # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for
++    # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a
++    # single entry. See MoRIIOWriter._prepare_transfer_plan.
++    transfer_offsets: dict = field(default_factory=dict)
+ 
+ 
+ class ROLE(Enum):
+diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+index 167eef6e1..1846a3c21 100644
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+@@ -1233,8 +1233,10 @@ class MoRIIOConnectorWorker:
+             block_size, kv_latent_dim = block_shape
+             self.slot_size_bytes = kv_elem_size * kv_latent_dim
+         else:
+-            # [2 (k and v), num_blocks, ...]
+-            self.num_blocks = first_kv_cache.shape[1]
++            # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V
++            # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read
++            # shape[1] here, which is the size-2 K/V axis, not num_blocks.)
++            self.num_blocks = first_kv_cache.shape[0]
+             block_rank = 3  # [block_size, kv_heads, head_dim]
+             block_shape = first_kv_cache.shape[-block_rank:]
+             block_size, n_kv_heads, head_dim = block_shape[-3:]
+@@ -1257,10 +1259,17 @@ class MoRIIOConnectorWorker:
+         caches_data = []
+ 
+         for cache_or_caches in kv_caches.values():
+-            cache_list = [cache_or_caches] if use_mla else cache_or_caches
++            # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs
++            # rank-5 (full attention, [K, V]). A single global use_mla flag
++            # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for
++            # hybrid models, so detect per cache. region_len is the actual tensor
++            # (or K/V half) byte size -- equivalent to num_blocks * block_len for
++            # homogeneous models, correct for heterogeneous ones.
++            cache_is_mla = cache_or_caches.dim() == 3
++            cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches
+             for cache in cache_list:
+                 base_addr = cache.data_ptr()
+-                region_len = self.num_blocks * self.block_len
++                region_len = cache.numel() * cache.element_size()
+                 caches_data.append((base_addr, region_len, cache.device.index, ""))
+                 kv_caches_base_addr.append(base_addr)
+ 
+@@ -1665,21 +1674,53 @@ class MoRIIOConnectorWorker:
+             Tuple of (local_offsets, remote_offsets, transfer_sizes)
+         """
+         assert self.kv_cache_shape is not None, "KV caches shape not initialized"
+-        is_mla = len(self.kv_cache_shape) == 3
+-        stride = self.kv_caches[layer_name].stride()
+-        sz = self.kv_caches[layer_name].element_size()
+-        if is_mla:
+-            blknum, blksize, hs = self.kv_cache_shape
+-            hn = 1
+-            block_stride = stride[0]
+-        else:
+-            _, blknum, blksize, hn, hs = self.kv_cache_shape
+-            local_ktov_stride = stride[0]
+-            block_stride = stride[1]
+-            remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks
++        # Per-layer, axis-aware geometry.
++        #
++        # The KV tensors vLLM hands the connector are laid out (verified on
++        # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1):
++        #   * main attention (GQA, dense + sparse layers):
++        #       shape (num_blocks, 2, block_size, num_kv_heads, head_dim)
++        #       -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0.
++        #   * sparse lightning-indexer (key-only, bf16):
++        #       shape (num_blocks, block_size, head_dim)  -- rank 3, no K/V axis.
++        #
++        # The legacy code assumed the FlashAttention-style [2, num_blocks, ...]
++        # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V
++        # stride) -- transposing block vs K/V so every block read the wrong
++        # region (corruption invisible to throughput-only benchmarks). Instead,
++        # detect the block axis (size == num_blocks) and the optional K/V axis
++        # (size 2) from THIS layer's own shape, and derive strides from them. The
++        # per-block stride is independent of num_blocks, so no remote-num_blocks
++        # scaling is needed (homogeneous P/D TP; heterogeneous TP still TODO).
++        layer_cache = self.kv_caches[layer_name]
++        layer_shape = tuple(layer_cache.shape)
++        stride = layer_cache.stride()
++        sz = layer_cache.element_size()
++        rank = len(layer_shape)
++
++        # K/V axis = the size-2 axis among the two outermost dims (if any).
++        kv_axis: int | None = None
++        if rank >= 4:
++            if layer_shape[0] == 2:
++                kv_axis = 0
++            elif layer_shape[1] == 2:
++                kv_axis = 1
++        # Block axis = outermost non-K/V axis (the one indexed by block_id).
++        block_axis = 0
++        if kv_axis == 0:
++            block_axis = 1
++        block_stride = stride[block_axis]
++        kv_stride = stride[kv_axis] if kv_axis is not None else 0
++        per_block = layer_shape[kv_axis] if kv_axis is not None else 1  # 2 (K,V) or 1
++
++        # One transferred slab = all dims except the block and K/V axes.
++        slot_elems = 1
++        for ax in range(rank):
++            if ax == block_axis or ax == kv_axis:
++                continue
++            slot_elems *= layer_shape[ax]
++        transfer_size_byte = slot_elems * sz
+ 
+-        transfer_size_byte = blksize * hn * hs * sz
+-        per_block = 1 if is_mla else 2
+         total = len(local_block_ids) * per_block
+         offset_local = [0] * total
+         offset_remote = [0] * total
+@@ -1688,17 +1729,9 @@ class MoRIIOConnectorWorker:
+         w = 0
+         for i, lb in enumerate(local_block_ids):
+             rb = remote_block_ids[i]
+-            # K
+-            offset_local[w] = sz * (lb * block_stride)
+-            offset_remote[w] = sz * (rb * block_stride)
+-            w += 1
+-            if not is_mla:
+-                # V
+-                # Handle num_block variations originating from PD (different kv strides)
+-                # TODO: address block_sz differences in heterogeneous TP scenarios
+-                # In MLA, we don't need to consider these two cases.
+-                offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride)
+-                offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride)
++            for kv in range(per_block):
++                offset_local[w] = sz * (lb * block_stride + kv * kv_stride)
++                offset_remote[w] = sz * (rb * block_stride + kv * kv_stride)
+                 w += 1
+ 
+         merged_l, merged_r, merged_s = self.merge_contiguous_blocks(
+@@ -1722,15 +1755,26 @@ class MoRIIOConnectorWorker:
+         dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0)
+         sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id)
+ 
+-        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
+-        offs = self._compute_block_transfer_offsets(
+-            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
+-        )
+-
+-        for layer_name in self.layer_name_to_local_kv_cache_metadata:
+-            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
+-                layer_name
++        # Heterogeneous-KV models register layers with different shapes/dtypes in
++        # a single KV-cache group sharing one block table, so block_ids match
++        # across layers but per-block byte geometry does not. Compute offsets per
++        # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3
++        # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing.
++        layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys())
++        offs_by_geom: dict = {}
++        for sess_idx, layer_name in enumerate(layer_names):
++            layer_cache = self.kv_caches[layer_name]
++            geom_key = (
++                tuple(layer_cache.shape),
++                tuple(layer_cache.stride()),
++                layer_cache.dtype,
+             )
++            offs = offs_by_geom.get(geom_key)
++            if offs is None:
++                offs = self._compute_block_transfer_offsets(
++                    layer_name, local_block_ids, remote_block_ids, remote_moriio_meta
++                )
++                offs_by_geom[geom_key] = offs
+             # TODO : apply multi-session batch-read when moriio support it
+             transfer_status = self.moriio_wrapper.read_remote_data(
+                 offs[2], offs[0], offs[1], sessions[sess_idx]
+diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
+index 3ca5f37ca..113eccad0 100644
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
+@@ -279,21 +279,33 @@ class MoRIIOWriter:
+         Returns:
+             The transfer plan
+         """
+-        # Compute offsets if not cached
+-        if request_info.transfer_offset is None:
++        # Compute offsets per distinct layer geometry. Heterogeneous-KV models
++        # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8
++        # caches in one KV-cache group; caching a single offset set per request
++        # and reusing it for every layer corrupts the indexer cache. Block_ids
++        # are shared (single block table), so offsets depend only on the layer's
++        # shape/stride/dtype -- memoize by that geometry key.
++        layer_cache = self.worker.kv_caches[task.layer_name]
++        geom_key = (
++            tuple(layer_cache.shape),
++            tuple(layer_cache.stride()),
++            layer_cache.dtype,
++        )
++        offsets = request_info.transfer_offsets.get(geom_key)
++        if offsets is None:
+             offsets = self.worker._compute_block_transfer_offsets(
+                 task.layer_name,
+                 task.local_block_ids,
+                 request_info.block_ids,
+                 remote_moriio_meta,
+             )
+-            request_info.transfer_offset = offsets
++            request_info.transfer_offsets[geom_key] = offsets
+ 
+         # Get session index
+         layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys())
+         sess_idx = layer_names.index(task.layer_name)
+ 
+-        local_off, remote_off, sizes = request_info.transfer_offset
++        local_off, remote_off, sizes = offsets
+ 
+         return LayerTransferPlan(
+             request_id=task.request_id,

From c1b19e26dde2b777b2271e0f2378af8e7fb64d5b Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 18 Jun 2026 18:22:44 -0400
Subject: [PATCH 11/20] disagg #1762: extend conc sweep to
 32,64,128,256,512,1024 at 1k1k and 8k1k

Widen the disagg sweep from conc 1,2,4,8,16 to
1,2,4,8,16,32,64,128,256,512,1024 for both seq-len scenarios (1P TP8 + 1D
TP8). The 8k1k conc-16 point keeps the multi-node eval marked (eval-conc=16)
so lm-eval still validates the MoRI-IO disagg pipeline.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ad879f894..d419ad73f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2751,7 +2751,7 @@ minimaxm3-fp8-mi355x-vllm-disagg:
       osl: 1024
       search-space:
       - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16 ]
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
         prefill:
           num-worker: 1
           tp: 8
@@ -2766,15 +2766,15 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
-    # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc 1,2,4,8,16. The
-    # conc-16 point also makes the multi-node eval policy (8k1k + conc >= 16) mark
-    # an lm-eval (eval-conc=16) — validating the M3 MoRI-IO disagg pipeline's
-    # correctness end-to-end.
+    # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc
+    # 1,2,4,8,16,32,64,128,256,512,1024. The conc-16 point also makes the
+    # multi-node eval policy (8k1k + conc >= 16) mark an lm-eval (eval-conc=16) —
+    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
     - isl: 8192
       osl: 1024
       search-space:
       - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16 ]
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
         prefill:
           num-worker: 1
           tp: 8

From d0a7844fbe34f7a54e0658e154a487b2d460d371 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Thu, 18 Jun 2026 21:25:13 -0400
Subject: [PATCH 12/20] disagg #1762: add TP4-prefill P/D layouts (TP4+TP8,
 TP4+TP4) at 1k1k and 8k1k

Add two asymmetric prefill/decode layouts alongside the existing TP8+TP8 sweep,
for both seq-len scenarios:
  - 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1..256
  - 1P TP4 + 1D TP4 (balanced half-node) at conc 64..1024

Per-worker TP is driven by the master-config prefill/decode tp: server_vllm.sh
sed-rewrites the models_vllm.yaml --tensor-parallel-size 8 placeholder to the
computed PREFILL_TP_SIZE/DECODE_TP_SIZE, so no models_vllm.yaml flag change is
needed (comment updated to say so). The multinode eval policy still marks exactly
one lm-eval (groups by dp-attn, not TP) on the TP8+TP8 8k1k layout.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               | 79 ++++++++++++++++++-
 .../multi_node/amd_utils/models_vllm.yaml     |  4 +-
 2 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d419ad73f..aa4887ad0 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2766,10 +2766,46 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
-    # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc
-    # 1,2,4,8,16,32,64,128,256,512,1024. The conc-16 point also makes the
-    # multi-node eval policy (8k1k + conc >= 16) mark an lm-eval (eval-conc=16) —
-    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    # 8k1k disagg sweep across three P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
+    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024). The multi-node
+    # eval policy (8k1k + conc >= 16) marks one lm-eval on the highest-max-conc
+    # layout (TP8+TP8, eval-conc=median=128) — validating the M3 MoRI-IO disagg
+    # pipeline's correctness end-to-end.
     - isl: 8192
       osl: 1024
       search-space:
@@ -2789,3 +2825,38 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
index e78b6c647..a566fe449 100644
--- a/benchmarks/multi_node/amd_utils/models_vllm.yaml
+++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -44,7 +44,9 @@ gpt-oss-120b:
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
 
 MiniMax-M3-MXFP8:
-  # MiniMax-M3 MXFP8 disagg smoke test (TP8 prefill + TP8 decode, no EP).
+  # MiniMax-M3 MXFP8 disagg, no EP. The --tensor-parallel-size 8 below is just a
+  # placeholder: server_vllm.sh sed-rewrites it to PREFILL_TP_SIZE/DECODE_TP_SIZE
+  # from the master-config prefill/decode tp (the sweep mixes TP8 and TP4 layouts).
   # --block-size 128 is mandatory (MSA sparse/index cache); text-only benchmark
   # so --language-model-only frees the vision encoder. gfx950 uses FP8 KV cache.
   prefill_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"

From 5c06ea75b8e9cb70135378e9e5ce318c9b6c7847 Mon Sep 17 00:00:00 2001
From: TianDi101 <ditian12@amd.com>
Date: Fri, 19 Jun 2026 11:36:16 +0000
Subject: [PATCH 13/20] feat(amd-disagg): bundle heterogeneous-TP + dup-ack
 fixes into unified MoRIIO diff
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces moriio-kv-layout-fix.diff with moriio-minimax-m3-disagg.diff, which
bundles three layered fixes for the stock minimax-m3 vLLM image:
1. KV-layout: axis-aware per-layer block offsets (the gsm8k 0.0008→0.958 fix,
   required for homogeneous TP too).
2. heterogeneous-TP addressing + guard: maps each decode rank to the correct
   prefill rank (tp_rank // ratio) for PREFILL_TP_SIZE != DECODE_TP_SIZE, and
   raises NotImplementedError for unsupported cases (prefill-TP > decode-TP,
   KV-head splitting) instead of silently corrupting KV.
3. dup-ack fan-in: with DECODE_TP_SIZE > PREFILL_TP_SIZE, producer counts ACKs
   per transfer_id and only frees KV blocks once all expected consumers ACK,
   preventing both the late-ACK EngineCore crash and KV reuse before slower
   decode ranks finish reading.

job.slurm and patches/README.md updated to reference the new diff name.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/multi_node/amd_utils/job.slurm     |  27 +-
 .../multi_node/amd_utils/patches/README.md    |  37 +-
 .../patches/moriio/moriio-kv-layout-fix.diff  | 218 --------
 .../moriio/moriio-minimax-m3-disagg.diff      | 479 ++++++++++++++++++
 4 files changed, 535 insertions(+), 226 deletions(-)
 delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
 create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 727f64632..1a546b361 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -97,14 +97,35 @@ fi
 # diff needs no extra mount -- just an in-container `patch` call. A failed
 # apply aborts the container: silently running unpatched would silently
 # corrupt accuracy, not just skip a feature.
-_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff"
+#
+# The single diff bundles three layered fixes (all in patches/moriio/,
+# see patches/README.md):
+#   1. KV-layout: the load-bearing accuracy fix (axis-aware per-layer offsets;
+#      gsm8k 0.0008 -> 0.958). Required for homogeneous TP too.
+#   2. heterogeneous-TP (no-op for homogeneous TP, required for
+#      PREFILL_TP_SIZE != DECODE_TP_SIZE -- see .github/configs/amd-master.yaml's TP4+TP8
+#      configs): handshake/notify port addressing maps each decode rank to the
+#      correct prefill rank instead of its own raw tp_rank (stock
+#      MoRIIOConnector has no fan-out concept at all), and guards (fail loud)
+#      the KV-head-split / prefill-TP>decode-TP cases MoRIIO can't serve.
+#   3. dup-ack: with DECODE_TP_SIZE > PREFILL_TP_SIZE, N decode ranks fan in to
+#      1 prefill rank and each sends its own completion ack for the same
+#      transfer_id. Freeing KV blocks on the first ack (the original
+#      MoRIIOConnector behavior) both crashes EngineCore on the late second ack
+#      (AssertionError in Scheduler._update_from_kv_xfer_finished) and risks
+#      silently corrupting KV if the slower decode rank's read is still in
+#      flight when the blocks are reused. Fix mirrors NIXL's
+#      consumer_notification_counts_by_req: producer counts acks per
+#      transfer_id (consumer embeds its own tp_size in the notify message) and
+#      only frees once all expected consumers have acked.
+_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff"
 _MORIIO_PATCH_CMD=""
 if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \
    && [[ -f "$_MORIIO_DIFF" ]] \
    && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \
    && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then
-    _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff || exit 1"
-    echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout diff inside container: ${_MORIIO_DIFF}"
+    _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff || exit 1"
+    echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout + heterogeneous-TP + dup-ack diff inside container: ${_MORIIO_DIFF}"
 fi
 
 xP="${xP:-1}"
diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md
index 27f9fc81d..a75f38854 100644
--- a/benchmarks/multi_node/amd_utils/patches/README.md
+++ b/benchmarks/multi_node/amd_utils/patches/README.md
@@ -84,8 +84,9 @@ container behavior is byte-identical to the unpatched path.
 
 ## `moriio/` (vLLM MoRIIO connector, MiniMax-M3)
 
-A unified diff (`moriio-kv-layout-fix.diff`), applied with `patch -p1`
-against the vLLM package dir inside the container, touching three files:
+A single unified diff (`moriio-minimax-m3-disagg.diff`), applied with
+`patch -p1` against the vLLM package dir inside the container, touching
+three files:
 
 ```
 /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/
@@ -117,10 +118,36 @@ indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets
 per geometry. Result: disagg gsm8k `strict-match 0.9583 /
 flexible-extract 0.9575` (matches single-node). Homogeneous models
 (uniform layout) are unaffected — one geometry, one offset set, same
-result. Heterogeneous-TP P/D (prefill TP ≠ decode TP) is still a TODO
-(same as upstream). Full write-up in
+result. Full write-up in
 `/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`.
 
+The diff also bundles two heterogeneous-TP layers (no-op for homogeneous
+TP, exercised by `.github/configs/amd-master.yaml`'s TP4-prefill + TP8-decode
+configs):
+
+- **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always
+  addresses remote rank == local `tp_rank`, which has no listener once
+  `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode
+  rank to the prefill rank holding its KV head (`tp_rank // ratio`,
+  mirroring NIXL's `TpKVTopology.get_target_remote_ranks`). This is
+  byte-correct only when KV heads are **replicated** (`tp_size >=
+  total_kv_heads`, i.e. ≤1 distinct head per rank — MiniMax-M3 has 4 KV
+  heads, so any TP≥4 is replicated). The cases MoRIIO can't serve —
+  prefill TP > decode TP (needs multi-rank fan-in) and KV-head splitting
+  (`total_kv_heads > prefill_tp`, which would need per-head slicing of the
+  NHD layout, unrepresentable as one `(offset,len)` per block) — now
+  **raise `NotImplementedError`** in `_compute_block_transfer_offsets`
+  instead of silently transferring corrupt KV. (NIXL likewise only splits
+  heads in HND layout and raises otherwise.)
+- **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode
+  ranks read from one prefill rank and each ACKs the same `transfer_id`.
+  The producer now counts ACKs per `transfer_id` (consumer embeds its own
+  `tp_size` in the notify payload) and only reports `finished_sending`
+  once all expected consumers have ACKed — preventing both the late-ACK
+  `EngineCore` crash and freeing/reusing KV blocks while a slower decode
+  rank is still reading. Mirrors NIXL's
+  `consumer_notification_counts_by_req`.
+
 ### How to enable
 
 `job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains
@@ -131,7 +158,7 @@ caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g. the
 
 ```bash
 patch -p1 -d /usr/local/lib/python3.12/dist-packages \
-  < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
+  < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
 ```
 
 (`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already
diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
deleted file mode 100644
index 7f6c435bf..000000000
--- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff
+++ /dev/null
@@ -1,218 +0,0 @@
-diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
-index 73694ce32..a30d30af8 100644
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
-@@ -80,6 +80,10 @@ class RemoteAllocInfo:
-     writes_done: int = 0
-     decode_dp_rank: int = 0
-     transfer_offset: tuple[list[int], list[int], list[int]] | None = None
-+    # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for
-+    # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a
-+    # single entry. See MoRIIOWriter._prepare_transfer_plan.
-+    transfer_offsets: dict = field(default_factory=dict)
- 
- 
- class ROLE(Enum):
-diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
-index 167eef6e1..1846a3c21 100644
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
-@@ -1233,8 +1233,10 @@ class MoRIIOConnectorWorker:
-             block_size, kv_latent_dim = block_shape
-             self.slot_size_bytes = kv_elem_size * kv_latent_dim
-         else:
--            # [2 (k and v), num_blocks, ...]
--            self.num_blocks = first_kv_cache.shape[1]
-+            # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V
-+            # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read
-+            # shape[1] here, which is the size-2 K/V axis, not num_blocks.)
-+            self.num_blocks = first_kv_cache.shape[0]
-             block_rank = 3  # [block_size, kv_heads, head_dim]
-             block_shape = first_kv_cache.shape[-block_rank:]
-             block_size, n_kv_heads, head_dim = block_shape[-3:]
-@@ -1257,10 +1259,17 @@ class MoRIIOConnectorWorker:
-         caches_data = []
- 
-         for cache_or_caches in kv_caches.values():
--            cache_list = [cache_or_caches] if use_mla else cache_or_caches
-+            # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs
-+            # rank-5 (full attention, [K, V]). A single global use_mla flag
-+            # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for
-+            # hybrid models, so detect per cache. region_len is the actual tensor
-+            # (or K/V half) byte size -- equivalent to num_blocks * block_len for
-+            # homogeneous models, correct for heterogeneous ones.
-+            cache_is_mla = cache_or_caches.dim() == 3
-+            cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches
-             for cache in cache_list:
-                 base_addr = cache.data_ptr()
--                region_len = self.num_blocks * self.block_len
-+                region_len = cache.numel() * cache.element_size()
-                 caches_data.append((base_addr, region_len, cache.device.index, ""))
-                 kv_caches_base_addr.append(base_addr)
- 
-@@ -1665,21 +1674,53 @@ class MoRIIOConnectorWorker:
-             Tuple of (local_offsets, remote_offsets, transfer_sizes)
-         """
-         assert self.kv_cache_shape is not None, "KV caches shape not initialized"
--        is_mla = len(self.kv_cache_shape) == 3
--        stride = self.kv_caches[layer_name].stride()
--        sz = self.kv_caches[layer_name].element_size()
--        if is_mla:
--            blknum, blksize, hs = self.kv_cache_shape
--            hn = 1
--            block_stride = stride[0]
--        else:
--            _, blknum, blksize, hn, hs = self.kv_cache_shape
--            local_ktov_stride = stride[0]
--            block_stride = stride[1]
--            remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks
-+        # Per-layer, axis-aware geometry.
-+        #
-+        # The KV tensors vLLM hands the connector are laid out (verified on
-+        # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1):
-+        #   * main attention (GQA, dense + sparse layers):
-+        #       shape (num_blocks, 2, block_size, num_kv_heads, head_dim)
-+        #       -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0.
-+        #   * sparse lightning-indexer (key-only, bf16):
-+        #       shape (num_blocks, block_size, head_dim)  -- rank 3, no K/V axis.
-+        #
-+        # The legacy code assumed the FlashAttention-style [2, num_blocks, ...]
-+        # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V
-+        # stride) -- transposing block vs K/V so every block read the wrong
-+        # region (corruption invisible to throughput-only benchmarks). Instead,
-+        # detect the block axis (size == num_blocks) and the optional K/V axis
-+        # (size 2) from THIS layer's own shape, and derive strides from them. The
-+        # per-block stride is independent of num_blocks, so no remote-num_blocks
-+        # scaling is needed (homogeneous P/D TP; heterogeneous TP still TODO).
-+        layer_cache = self.kv_caches[layer_name]
-+        layer_shape = tuple(layer_cache.shape)
-+        stride = layer_cache.stride()
-+        sz = layer_cache.element_size()
-+        rank = len(layer_shape)
-+
-+        # K/V axis = the size-2 axis among the two outermost dims (if any).
-+        kv_axis: int | None = None
-+        if rank >= 4:
-+            if layer_shape[0] == 2:
-+                kv_axis = 0
-+            elif layer_shape[1] == 2:
-+                kv_axis = 1
-+        # Block axis = outermost non-K/V axis (the one indexed by block_id).
-+        block_axis = 0
-+        if kv_axis == 0:
-+            block_axis = 1
-+        block_stride = stride[block_axis]
-+        kv_stride = stride[kv_axis] if kv_axis is not None else 0
-+        per_block = layer_shape[kv_axis] if kv_axis is not None else 1  # 2 (K,V) or 1
-+
-+        # One transferred slab = all dims except the block and K/V axes.
-+        slot_elems = 1
-+        for ax in range(rank):
-+            if ax == block_axis or ax == kv_axis:
-+                continue
-+            slot_elems *= layer_shape[ax]
-+        transfer_size_byte = slot_elems * sz
- 
--        transfer_size_byte = blksize * hn * hs * sz
--        per_block = 1 if is_mla else 2
-         total = len(local_block_ids) * per_block
-         offset_local = [0] * total
-         offset_remote = [0] * total
-@@ -1688,17 +1729,9 @@ class MoRIIOConnectorWorker:
-         w = 0
-         for i, lb in enumerate(local_block_ids):
-             rb = remote_block_ids[i]
--            # K
--            offset_local[w] = sz * (lb * block_stride)
--            offset_remote[w] = sz * (rb * block_stride)
--            w += 1
--            if not is_mla:
--                # V
--                # Handle num_block variations originating from PD (different kv strides)
--                # TODO: address block_sz differences in heterogeneous TP scenarios
--                # In MLA, we don't need to consider these two cases.
--                offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride)
--                offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride)
-+            for kv in range(per_block):
-+                offset_local[w] = sz * (lb * block_stride + kv * kv_stride)
-+                offset_remote[w] = sz * (rb * block_stride + kv * kv_stride)
-                 w += 1
- 
-         merged_l, merged_r, merged_s = self.merge_contiguous_blocks(
-@@ -1722,15 +1755,26 @@ class MoRIIOConnectorWorker:
-         dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0)
-         sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id)
- 
--        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
--        offs = self._compute_block_transfer_offsets(
--            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
--        )
--
--        for layer_name in self.layer_name_to_local_kv_cache_metadata:
--            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
--                layer_name
-+        # Heterogeneous-KV models register layers with different shapes/dtypes in
-+        # a single KV-cache group sharing one block table, so block_ids match
-+        # across layers but per-block byte geometry does not. Compute offsets per
-+        # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3
-+        # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing.
-+        layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys())
-+        offs_by_geom: dict = {}
-+        for sess_idx, layer_name in enumerate(layer_names):
-+            layer_cache = self.kv_caches[layer_name]
-+            geom_key = (
-+                tuple(layer_cache.shape),
-+                tuple(layer_cache.stride()),
-+                layer_cache.dtype,
-             )
-+            offs = offs_by_geom.get(geom_key)
-+            if offs is None:
-+                offs = self._compute_block_transfer_offsets(
-+                    layer_name, local_block_ids, remote_block_ids, remote_moriio_meta
-+                )
-+                offs_by_geom[geom_key] = offs
-             # TODO : apply multi-session batch-read when moriio support it
-             transfer_status = self.moriio_wrapper.read_remote_data(
-                 offs[2], offs[0], offs[1], sessions[sess_idx]
-diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
-index 3ca5f37ca..113eccad0 100644
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
-@@ -279,21 +279,33 @@ class MoRIIOWriter:
-         Returns:
-             The transfer plan
-         """
--        # Compute offsets if not cached
--        if request_info.transfer_offset is None:
-+        # Compute offsets per distinct layer geometry. Heterogeneous-KV models
-+        # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8
-+        # caches in one KV-cache group; caching a single offset set per request
-+        # and reusing it for every layer corrupts the indexer cache. Block_ids
-+        # are shared (single block table), so offsets depend only on the layer's
-+        # shape/stride/dtype -- memoize by that geometry key.
-+        layer_cache = self.worker.kv_caches[task.layer_name]
-+        geom_key = (
-+            tuple(layer_cache.shape),
-+            tuple(layer_cache.stride()),
-+            layer_cache.dtype,
-+        )
-+        offsets = request_info.transfer_offsets.get(geom_key)
-+        if offsets is None:
-             offsets = self.worker._compute_block_transfer_offsets(
-                 task.layer_name,
-                 task.local_block_ids,
-                 request_info.block_ids,
-                 remote_moriio_meta,
-             )
--            request_info.transfer_offset = offsets
-+            request_info.transfer_offsets[geom_key] = offsets
- 
-         # Get session index
-         layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys())
-         sess_idx = layer_names.index(task.layer_name)
- 
--        local_off, remote_off, sizes = request_info.transfer_offset
-+        local_off, remote_off, sizes = offsets
- 
-         return LayerTransferPlan(
-             request_id=task.request_id,
diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
new file mode 100644
index 000000000..700cf26c3
--- /dev/null
+++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
@@ -0,0 +1,479 @@
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
+@@ -80,6 +80,10 @@
+     writes_done: int = 0
+     decode_dp_rank: int = 0
+     transfer_offset: tuple[list[int], list[int], list[int]] | None = None
++    # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for
++    # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a
++    # single entry. See MoRIIOWriter._prepare_transfer_plan.
++    transfer_offsets: dict = field(default_factory=dict)
+ 
+ 
+ class ROLE(Enum):
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+@@ -740,6 +740,21 @@
+         # Completions that arrived before transfer_id_to_request_id was populated.
+         # Retried each step until the mapping is established.
+         self._unmatched_write_completions: set[str] = set()
++        # Producer side: with heterogeneous-TP P/D (DECODE_TP_SIZE >
++        # PREFILL_TP_SIZE), more than one decode rank reads this rank's KV
++        # slice and each sends its own completion notify for the SAME
++        # transfer_id once its own read finishes. We must not report
++        # finished_sending (which lets the core scheduler free/reuse the
++        # blocks) until *all* of them have acked -- otherwise a still-in-
++        # flight slower reader can read corrupted/reused memory. This counts
++        # per-transfer_id notifies against the expected fan-in count (derived
++        # from the consumer's own tp_size, sent alongside the transfer_id --
++        # see send_notify call sites) and only resolves once it's complete.
++        # Mirrors NIXL's consumer_notification_counts_by_req. Pruned in
++        # start_load_kv() once a transfer_id drops out of the live mapping
++        # (e.g. force-freed by the scheduler's defer_timeout without ever
++        # reaching full count).
++        self._consumer_notification_counts: dict[str, int] = {}
+ 
+         role = "producer" if self.is_producer else "consumer"
+         engine_suffix = (
+@@ -1085,6 +1100,30 @@
+                         req_id.decode(),
+                     )
+ 
++    def _remote_tp_rank(self, remote_tp_size: int) -> int:
++        """Map this worker's local tp_rank to the remote tp_rank it must
++        address when local and remote TP sizes differ (heterogeneous-TP P/D,
++        e.g. PREFILL_TP_SIZE=4 / DECODE_TP_SIZE=8).
++
++        vLLM replicates KV heads across TP ranks in groups of size
++        local_tp_size/remote_tp_size (see ModelConfig.get_num_kv_heads:
++        max(1, total_kv_heads // tp_size)), so every local rank within a
++        replica group must address the SAME single remote rank --
++        floor(local_tp_rank / ratio) -- instead of its own raw tp_rank, which
++        has no listener once local tp_size > remote tp_size. Mirrors vLLM's
++        NIXL connector (TpKVTopology.get_target_remote_ranks). The reverse
++        case (remote tp_size > local tp_size, e.g. P-TP > D-TP) would need
++        multi-rank fan-in reads and is not handled here.
++        """
++        if remote_tp_size == self.world_size:
++            return self.tp_rank
++        assert self.world_size % remote_tp_size == 0, (
++            f"local tp_size {self.world_size} must be a multiple of remote "
++            f"tp_size {remote_tp_size} for heterogeneous-TP P/D (remote "
++            "tp_size > local tp_size is not supported)"
++        )
++        return self.tp_rank // (self.world_size // remote_tp_size)
++
+     def _moriio_handshake(
+         self,
+         host: str,
+@@ -1101,7 +1140,9 @@
+         # a hack to keep us moving. We will switch when moving to etcd
+         # or where we have a single ZMQ socket in the scheduler.
+ 
+-        port_offset = get_port_offset(remote_dp_rank, self.tp_rank)
++        port_offset = get_port_offset(
++            remote_dp_rank, self._remote_tp_rank(remote_tp_size)
++        )
+         path = make_zmq_path("tcp", host, port + port_offset)
+         logger.debug("handshake Querying metadata on path: %s", path)
+ 
+@@ -1233,8 +1274,10 @@
+             block_size, kv_latent_dim = block_shape
+             self.slot_size_bytes = kv_elem_size * kv_latent_dim
+         else:
+-            # [2 (k and v), num_blocks, ...]
+-            self.num_blocks = first_kv_cache.shape[1]
++            # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V
++            # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read
++            # shape[1] here, which is the size-2 K/V axis, not num_blocks.)
++            self.num_blocks = first_kv_cache.shape[0]
+             block_rank = 3  # [block_size, kv_heads, head_dim]
+             block_shape = first_kv_cache.shape[-block_rank:]
+             block_size, n_kv_heads, head_dim = block_shape[-3:]
+@@ -1257,10 +1300,17 @@
+         caches_data = []
+ 
+         for cache_or_caches in kv_caches.values():
+-            cache_list = [cache_or_caches] if use_mla else cache_or_caches
++            # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs
++            # rank-5 (full attention, [K, V]). A single global use_mla flag
++            # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for
++            # hybrid models, so detect per cache. region_len is the actual tensor
++            # (or K/V half) byte size -- equivalent to num_blocks * block_len for
++            # homogeneous models, correct for heterogeneous ones.
++            cache_is_mla = cache_or_caches.dim() == 3
++            cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches
+             for cache in cache_list:
+                 base_addr = cache.data_ptr()
+-                region_len = self.num_blocks * self.block_len
++                region_len = cache.numel() * cache.element_size()
+                 caches_data.append((base_addr, region_len, cache.device.index, ""))
+                 kv_caches_base_addr.append(base_addr)
+ 
+@@ -1338,13 +1388,45 @@
+         done_sending, done_recving = set(), set()
+ 
+         if self.is_producer:
+-            # pop_finished_req_ids returns transfer_ids (the ZMQ payload sent
+-            # by decode via send_notify); map back to req_ids for the scheduler.
+-            finished_transfer_ids = self.moriio_wrapper.pop_finished_req_ids()
++            # pop_finished_req_ids returns every completion message received
++            # since the last call (NOT deduped -- with heterogeneous-TP
++            # fan-out, two different decode ranks legitimately send
++            # byte-identical messages for the same transfer_id and each one
++            # must be counted). Payload is "<transfer_id>:<consumer_tp_size>"
++            # (see send_notify call sites); plain transfer_id with no ":" is
++            # treated as a 1:1 ack (internal WRITE-mode completions).
++            finished_transfer_msgs = self.moriio_wrapper.pop_finished_req_ids()
++            resolved_transfer_ids: set[str] = set()
++            for raw_msg in finished_transfer_msgs:
++                xfer_id, _, tp_size_str = raw_msg.rpartition(":")
++                if not xfer_id:
++                    xfer_id, tp_size_str = raw_msg, str(self.world_size)
++                if xfer_id not in self.transfer_id_to_request_id:
++                    logger.warning(
++                        "Could not find %s in transfer_id_to_request_id "
++                        "lookup table. This could lead to a possible hang.",
++                        xfer_id,
++                    )
++                    continue
++                consumer_tp_size = int(tp_size_str)
++                if consumer_tp_size > self.world_size:
++                    assert consumer_tp_size % self.world_size == 0, (
++                        f"consumer tp_size {consumer_tp_size} must be a "
++                        f"multiple of producer tp_size {self.world_size} "
++                        "for heterogeneous-TP P/D"
++                    )
++                    expected_acks = consumer_tp_size // self.world_size
++                else:
++                    expected_acks = 1
++                count = self._consumer_notification_counts.get(xfer_id, 0) + 1
++                if count >= expected_acks:
++                    self._consumer_notification_counts.pop(xfer_id, None)
++                    resolved_transfer_ids.add(xfer_id)
++                else:
++                    self._consumer_notification_counts[xfer_id] = count
+             done_sending = {
+                 self.transfer_id_to_request_id[xfer_id]
+-                for xfer_id in finished_transfer_ids
+-                if xfer_id in self.transfer_id_to_request_id
++                for xfer_id in resolved_transfer_ids
+             }
+         else:
+             if self.mode == MoRIIOMode.WRITE:
+@@ -1389,7 +1471,13 @@
+                 if last.Succeeded():
+                     host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
+                     done_req_ids.add(xfer_id)
+-                    self.moriio_wrapper.send_notify(xfer_id, host, port)
++                    # Embed our own tp_size so the producer can tell, with
++                    # heterogeneous-TP fan-out, how many consumer acks to
++                    # expect for this transfer_id before it's safe to free
++                    # the blocks (see _consumer_notification_counts).
++                    self.moriio_wrapper.send_notify(
++                        f"{xfer_id}:{self.world_size}", host, port
++                    )
+                     to_remove.append(req_id)
+                 elif last.Failed():
+                     logger.error(
+@@ -1402,7 +1490,9 @@
+                     )
+                     host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
+                     try:
+-                        self.moriio_wrapper.send_notify(xfer_id, host, port)
++                        self.moriio_wrapper.send_notify(
++                            f"{xfer_id}:{self.world_size}", host, port
++                        )
+                     except Exception:
+                         logger.exception(
+                             "Failed to send error notification for request %s",
+@@ -1488,6 +1578,15 @@
+         """
+         self.transfer_id_to_request_id = metadata.transfer_id_to_request_id
+         if self.is_producer:
++            # Drop counts for transfer_ids that dropped out of the live
++            # mapping without ever reaching full ack count (e.g. force-freed
++            # by the scheduler's defer_timeout) -- they can never resolve via
++            # get_finished() anymore, so stop tracking them to bound memory.
++            self._consumer_notification_counts = {
++                xfer_id: count
++                for xfer_id, count in self._consumer_notification_counts.items()
++                if xfer_id in self.transfer_id_to_request_id
++            }
+             self.moriio_wrapper.async_wait_reqid()
+             return
+         if self.mode == MoRIIOMode.WRITE:
+@@ -1560,6 +1659,7 @@
+             remote_block_ids=meta.remote_block_ids,
+             remote_host=meta.remote_host,
+             remote_notify_port=meta.remote_notify_port,
++            remote_tp_size=meta.tp_size,
+         )
+ 
+     def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer):
+@@ -1653,6 +1753,7 @@
+         local_block_ids: list[int],
+         remote_block_ids: list[int],
+         remote_moriio_meta: MoRIIOAgentMetadata,
++        remote_tp_size: int | None = None,
+     ) -> tuple[list[int], list[int], list[int]]:
+         """Compute transfer offsets for block data.
+ 
+@@ -1661,25 +1762,110 @@
+             local_block_ids: IDs of local blocks
+             remote_block_ids: IDs of remote blocks
+             remote_moriio_meta: Metadata of the remote MoRIIO agent
++            remote_tp_size: tp_size of the remote (producer/prefill) instance.
++                Defaults to this worker's world_size (homogeneous P/D TP). When
++                it differs, used to validate that KV heads are replicated (the
++                only heterogeneous-TP regime MoRIIO supports) -- see the guard
++                below.
+         Returns:
+             Tuple of (local_offsets, remote_offsets, transfer_sizes)
+         """
+         assert self.kv_cache_shape is not None, "KV caches shape not initialized"
+-        is_mla = len(self.kv_cache_shape) == 3
+-        stride = self.kv_caches[layer_name].stride()
+-        sz = self.kv_caches[layer_name].element_size()
+-        if is_mla:
+-            blknum, blksize, hs = self.kv_cache_shape
+-            hn = 1
+-            block_stride = stride[0]
+-        else:
+-            _, blknum, blksize, hn, hs = self.kv_cache_shape
+-            local_ktov_stride = stride[0]
+-            block_stride = stride[1]
+-            remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks
++        # Per-layer, axis-aware geometry.
++        #
++        # The KV tensors vLLM hands the connector are laid out (verified on
++        # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1):
++        #   * main attention (GQA, dense + sparse layers):
++        #       shape (num_blocks, 2, block_size, num_kv_heads, head_dim)
++        #       -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0.
++        #   * sparse lightning-indexer (key-only, bf16):
++        #       shape (num_blocks, block_size, head_dim)  -- rank 3, no K/V axis.
++        #
++        # The legacy code assumed the FlashAttention-style [2, num_blocks, ...]
++        # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V
++        # stride) -- transposing block vs K/V so every block read the wrong
++        # region (corruption invisible to throughput-only benchmarks). Instead,
++        # detect the block axis (size == num_blocks) and the optional K/V axis
++        # (size 2) from THIS layer's own shape, and derive strides from them. The
++        # per-block stride is independent of num_blocks, so no remote-num_blocks
++        # scaling is needed.
++        layer_cache = self.kv_caches[layer_name]
++        layer_shape = tuple(layer_cache.shape)
++        stride = layer_cache.stride()
++        sz = layer_cache.element_size()
++        rank = len(layer_shape)
++
++        # K/V axis = the size-2 axis among the two outermost dims (if any).
++        kv_axis: int | None = None
++        if rank >= 4:
++            if layer_shape[0] == 2:
++                kv_axis = 0
++            elif layer_shape[1] == 2:
++                kv_axis = 1
++        # Block axis = outermost non-K/V axis (the one indexed by block_id).
++        block_axis = 0
++        if kv_axis == 0:
++            block_axis = 1
++        block_stride = stride[block_axis]
++        kv_stride = stride[kv_axis] if kv_axis is not None else 0
++        per_block = layer_shape[kv_axis] if kv_axis is not None else 1  # 2 (K,V) or 1
++
++        # One transferred slab = all dims except the block and K/V axes.
++        slot_elems = 1
++        for ax in range(rank):
++            if ax == block_axis or ax == kv_axis:
++                continue
++            slot_elems *= layer_shape[ax]
++
++        # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) -----------
++        # When local (decode) and remote (prefill) tp_size differ, _remote_tp_rank
++        # maps each local rank to the single remote rank it reads from. That
++        # whole-block read is byte-correct *only* when KV heads are REPLICATED,
++        # i.e. the remote rank holds exactly the head(s) this rank owns and no
++        # more. vLLM replicates KV heads whenever tp_size >= total_kv_heads
++        # (ModelConfig.get_num_kv_heads -> max(1, total//tp)); with the
++        # r // tp_ratio rank mapping, head ownership then lines up exactly, so
++        # no head offset is needed. This is MiniMax-M3's regime (4 KV heads,
++        # TP>=4) and is the only heterogeneous-TP case MoRIIO supports.
++        #
++        # If instead a remote rank packs MORE distinct KV heads than this local
++        # rank owns (total_kv_heads > remote_tp, i.e. heads are SPLIT on the
++        # producer), each fan-in rank would have to read only its head slice of
++        # the remote block. MoRIIO's per-block tensors are NHD
++        # ([block_size, kv_heads, head_dim] -- heads interleaved per token), so
++        # a head slice is NOT a contiguous sub-region and cannot be expressed as
++        # a single (offset, len) per block. NIXL only supports head splitting in
++        # HND layout and raises otherwise; we do the same -- fail loud rather
++        # than silently corrupt KV. MLA / rank-3 indexer caches are always
++        # replicated (no K/V axis) and never hit this path.
++        local_tp = self.world_size
++        remote_tp = remote_tp_size if remote_tp_size is not None else local_tp
++        if remote_tp != local_tp and not self.use_mla and kv_axis is not None:
++            total_kv_heads = self.model_config.get_total_num_kv_heads()
++            if remote_tp > local_tp:
++                # Prefill TP > decode TP: this rank would need to fan IN reads
++                # from multiple remote ranks (NIXL's negative-tp_ratio path).
++                raise NotImplementedError(
++                    f"Heterogeneous-TP with remote (prefill) tp_size {remote_tp} "
++                    f"> local (decode) tp_size {local_tp} requires multi-rank "
++                    "fan-in reads, not supported by MoRIIOConnector."
++                )
++            remote_heads = max(1, total_kv_heads // remote_tp)
++            local_heads = max(1, total_kv_heads // local_tp)
++            if remote_heads > local_heads:
++                # KV heads are split (not replicated) on the producer -> would
++                # need NHD head slicing, which MoRIIO can't express per block.
++                raise NotImplementedError(
++                    f"Heterogeneous-TP head splitting (total_kv_heads "
++                    f"{total_kv_heads} > prefill tp_size {remote_tp}: "
++                    f"{remote_heads} heads/rank on prefill vs {local_heads} on "
++                    "decode) requires per-head slicing of an NHD KV layout, not "
++                    "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= "
++                    "total_kv_heads so KV heads are replicated."
++                )
++
++        transfer_size_byte = slot_elems * sz
+ 
+-        transfer_size_byte = blksize * hn * hs * sz
+-        per_block = 1 if is_mla else 2
+         total = len(local_block_ids) * per_block
+         offset_local = [0] * total
+         offset_remote = [0] * total
+@@ -1688,17 +1874,9 @@
+         w = 0
+         for i, lb in enumerate(local_block_ids):
+             rb = remote_block_ids[i]
+-            # K
+-            offset_local[w] = sz * (lb * block_stride)
+-            offset_remote[w] = sz * (rb * block_stride)
+-            w += 1
+-            if not is_mla:
+-                # V
+-                # Handle num_block variations originating from PD (different kv strides)
+-                # TODO: address block_sz differences in heterogeneous TP scenarios
+-                # In MLA, we don't need to consider these two cases.
+-                offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride)
+-                offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride)
++            for kv in range(per_block):
++                offset_local[w] = sz * (lb * block_stride + kv * kv_stride)
++                offset_remote[w] = sz * (rb * block_stride + kv * kv_stride)
+                 w += 1
+ 
+         merged_l, merged_r, merged_s = self.merge_contiguous_blocks(
+@@ -1715,6 +1893,7 @@
+         transfer_id: str,
+         remote_host: str,
+         remote_notify_port: int,
++        remote_tp_size: int,
+     ) -> None:
+         if self.mode == MoRIIOMode.WRITE:
+             return
+@@ -1722,15 +1901,30 @@
+         dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0)
+         sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id)
+ 
+-        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
+-        offs = self._compute_block_transfer_offsets(
+-            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
+-        )
+-
+-        for layer_name in self.layer_name_to_local_kv_cache_metadata:
+-            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
+-                layer_name
++        # Heterogeneous-KV models register layers with different shapes/dtypes in
++        # a single KV-cache group sharing one block table, so block_ids match
++        # across layers but per-block byte geometry does not. Compute offsets per
++        # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3
++        # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing.
++        layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys())
++        offs_by_geom: dict = {}
++        for sess_idx, layer_name in enumerate(layer_names):
++            layer_cache = self.kv_caches[layer_name]
++            geom_key = (
++                tuple(layer_cache.shape),
++                tuple(layer_cache.stride()),
++                layer_cache.dtype,
+             )
++            offs = offs_by_geom.get(geom_key)
++            if offs is None:
++                offs = self._compute_block_transfer_offsets(
++                    layer_name,
++                    local_block_ids,
++                    remote_block_ids,
++                    remote_moriio_meta,
++                    remote_tp_size=remote_tp_size,
++                )
++                offs_by_geom[geom_key] = offs
+             # TODO : apply multi-session batch-read when moriio support it
+             transfer_status = self.moriio_wrapper.read_remote_data(
+                 offs[2], offs[0], offs[1], sessions[sess_idx]
+@@ -1739,6 +1933,6 @@
+                 self._recving_transfers[request_id].append(transfer_status)
+                 self._recving_transfers_callback_addr[request_id] = (
+                     remote_host,
+-                    str(remote_notify_port + self.tp_rank),
++                    str(remote_notify_port + self._remote_tp_rank(remote_tp_size)),
+                     transfer_id,
+                 )
+--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
+@@ -279,21 +279,33 @@
+         Returns:
+             The transfer plan
+         """
+-        # Compute offsets if not cached
+-        if request_info.transfer_offset is None:
++        # Compute offsets per distinct layer geometry. Heterogeneous-KV models
++        # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8
++        # caches in one KV-cache group; caching a single offset set per request
++        # and reusing it for every layer corrupts the indexer cache. Block_ids
++        # are shared (single block table), so offsets depend only on the layer's
++        # shape/stride/dtype -- memoize by that geometry key.
++        layer_cache = self.worker.kv_caches[task.layer_name]
++        geom_key = (
++            tuple(layer_cache.shape),
++            tuple(layer_cache.stride()),
++            layer_cache.dtype,
++        )
++        offsets = request_info.transfer_offsets.get(geom_key)
++        if offsets is None:
+             offsets = self.worker._compute_block_transfer_offsets(
+                 task.layer_name,
+                 task.local_block_ids,
+                 request_info.block_ids,
+                 remote_moriio_meta,
+             )
+-            request_info.transfer_offset = offsets
++            request_info.transfer_offsets[geom_key] = offsets
+ 
+         # Get session index
+         layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys())
+         sess_idx = layer_names.index(task.layer_name)
+ 
+-        local_off, remote_off, sizes = request_info.transfer_offset
++        local_off, remote_off, sizes = offsets
+ 
+         return LayerTransferPlan(
+             request_id=task.request_id,
+@@ -671,9 +683,14 @@
+             raise
+ 
+     def pop_finished_req_ids(self):
+-        # producer invocation: get the set of completed requests at the decode
++        # Producer invocation: get all completion messages received since the
++        # last call. Returned as a list, NOT deduped -- with heterogeneous-TP
++        # fan-out, two different decode ranks can send byte-identical
++        # messages for the same transfer_id, and the caller (get_finished())
++        # needs to count every individual occurrence to know when all
++        # expected consumers have acked.
+         with self.lock:
+-            done_send = set(self.done_req_ids)
++            done_send = list(self.done_req_ids)
+             self.done_req_ids = []
+         return done_send
+ 

From 79d137de418b31f7c6a2ee8268e54cb32e02fb6b Mon Sep 17 00:00:00 2001
From: TianDi101 <ditian12@amd.com>
Date: Fri, 19 Jun 2026 12:47:51 +0000
Subject: [PATCH 14/20] fix(moriio): correct _remote_tp_rank for prefill-TP >
 decode-TP (P8/D4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With P8/D4 and 4 KV heads, vLLM distributes heads across prefill ranks
in consecutive pairs: (rank0,rank1)→head0, (rank2,rank3)→head1, etc.
The previous patch used `return self.tp_rank` for the P>D branch, which
made decode rank 1 connect to prefill rank 1 (holds head0) instead of
prefill rank 2 (holds head1) — corrupting KV for all decode ranks except 0.

Fix: use `self.tp_rank * ratio` (ratio = remote_tp_size // local_tp_size),
the symmetric counterpart to the D>P case's `tp_rank // ratio`. This maps
each decode rank to the *first* prefill rank of its head group, which holds
the correct KV content via vLLM's replication scheme.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../multi_node/amd_utils/patches/README.md    |  29 +++--
 .../moriio/moriio-minimax-m3-disagg.diff      | 112 +++++++++---------
 2 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md
index a75f38854..4941540e3 100644
--- a/benchmarks/multi_node/amd_utils/patches/README.md
+++ b/benchmarks/multi_node/amd_utils/patches/README.md
@@ -128,17 +128,24 @@ configs):
 - **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always
   addresses remote rank == local `tp_rank`, which has no listener once
   `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode
-  rank to the prefill rank holding its KV head (`tp_rank // ratio`,
-  mirroring NIXL's `TpKVTopology.get_target_remote_ranks`). This is
-  byte-correct only when KV heads are **replicated** (`tp_size >=
-  total_kv_heads`, i.e. ≤1 distinct head per rank — MiniMax-M3 has 4 KV
-  heads, so any TP≥4 is replicated). The cases MoRIIO can't serve —
-  prefill TP > decode TP (needs multi-rank fan-in) and KV-head splitting
-  (`total_kv_heads > prefill_tp`, which would need per-head slicing of the
-  NHD layout, unrepresentable as one `(offset,len)` per block) — now
-  **raise `NotImplementedError`** in `_compute_block_transfer_offsets`
-  instead of silently transferring corrupt KV. (NIXL likewise only splits
-  heads in HND layout and raises otherwise.)
+  rank to the correct single prefill rank. Two regimes, both requiring
+  **replicated** KV heads (`tp_size >= total_kv_heads`, ≤1 distinct head
+  per rank — MiniMax-M3 has 4 KV heads, so any TP≥4 is replicated):
+  - `D-TP > P-TP` (e.g. P4/D8): `tp_rank // ratio`, mirroring NIXL's
+    `TpKVTopology.get_target_remote_ranks`. Multiple decode ranks read
+    from one prefill rank.
+  - `P-TP > D-TP` (e.g. P8/D4): vLLM distributes heads across prefill
+    ranks in consecutive pairs — (rank0,rank1)→head0, (rank2,rank3)→head1,
+    etc. Decode rank k must connect to the **first** rank of its head group:
+    `tp_rank * ratio`. Using `tp_rank` directly (as the original patch did)
+    is wrong for ranks > 0: decode rank 1 lands on prefill rank 1 (holds
+    head0) instead of prefill rank 2 (holds head1), producing garbage KV.
+  The one unsupported case — KV-head **splitting** (`total_kv_heads >
+  prefill_tp`, where each prefill rank holds a distinct head subset that
+  a decode rank would need to slice from NHD layout, unrepresentable as a
+  single `(offset,len)` per block) — **raises `NotImplementedError`** in
+  `_compute_block_transfer_offsets`. (NIXL likewise only splits heads in
+  HND layout and raises otherwise.)
 - **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode
   ranks read from one prefill rank and each ACKs the same `transfer_id`.
   The producer now counts ACKs per `transfer_id` (consumer embeds its own
diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
index 700cf26c3..4835397b0 100644
--- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
+++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
@@ -35,38 +35,52 @@
  
          role = "producer" if self.is_producer else "consumer"
          engine_suffix = (
-@@ -1085,6 +1100,30 @@
+@@ -1085,6 +1100,40 @@
                          req_id.decode(),
                      )
  
 +    def _remote_tp_rank(self, remote_tp_size: int) -> int:
-+        """Map this worker's local tp_rank to the remote tp_rank it must
-+        address when local and remote TP sizes differ (heterogeneous-TP P/D,
-+        e.g. PREFILL_TP_SIZE=4 / DECODE_TP_SIZE=8).
++        """Map this worker's local tp_rank to the single remote tp_rank it must
++        address when local and remote TP sizes differ (heterogeneous-TP P/D).
 +
-+        vLLM replicates KV heads across TP ranks in groups of size
-+        local_tp_size/remote_tp_size (see ModelConfig.get_num_kv_heads:
-+        max(1, total_kv_heads // tp_size)), so every local rank within a
-+        replica group must address the SAME single remote rank --
-+        floor(local_tp_rank / ratio) -- instead of its own raw tp_rank, which
-+        has no listener once local tp_size > remote tp_size. Mirrors vLLM's
-+        NIXL connector (TpKVTopology.get_target_remote_ranks). The reverse
-+        case (remote tp_size > local tp_size, e.g. P-TP > D-TP) would need
-+        multi-rank fan-in reads and is not handled here.
++        Two regimes (both require KV heads to be REPLICATED, not split -- see
++        guard in _compute_block_transfer_offsets):
++
++        * decode-TP > prefill-TP (e.g. P4/D8): multiple decode ranks (in groups
++          of ratio = decode_tp // prefill_tp) share one prefill rank's KV slice.
++          floor(local_tp_rank / ratio) maps each decode rank to its prefill rank.
++          Mirrors NIXL TpKVTopology.get_target_remote_ranks.
++        * prefill-TP > decode-TP (e.g. P8/D4): vLLM distributes 4 KV heads
++          across 8 prefill ranks in consecutive pairs -- (rank0,rank1)→head0,
++          (rank2,rank3)→head1, etc. Each decode rank must address the FIRST rank
++          of its paired group: local_tp_rank * ratio (NOT the same-indexed rank,
++          which would land in the wrong head's group for ranks > 0).
++          Head-splitting is rejected in _compute_block_transfer_offsets.
 +        """
 +        if remote_tp_size == self.world_size:
 +            return self.tp_rank
++        if remote_tp_size > self.world_size:
++            # Prefill-TP > decode-TP (e.g. P8/D4, replicated KV heads).
++            # vLLM pairs prefill ranks per head: decode rank k must connect to
++            # the first prefill rank of its head group (k * ratio), NOT rank k.
++            # Example (P8/D4, 4 KV heads): decode rank 1 (head1) → prefill
++            # rank 2 (not rank 1, which holds head0 alongside rank 0).
++            assert remote_tp_size % self.world_size == 0, (
++                f"remote tp_size {remote_tp_size} must be a multiple of local "
++                f"tp_size {self.world_size} for heterogeneous-TP P/D"
++            )
++            return self.tp_rank * (remote_tp_size // self.world_size)
++        # Decode-TP > prefill-TP: floor-map multiple decode ranks to one prefill rank.
 +        assert self.world_size % remote_tp_size == 0, (
 +            f"local tp_size {self.world_size} must be a multiple of remote "
-+            f"tp_size {remote_tp_size} for heterogeneous-TP P/D (remote "
-+            "tp_size > local tp_size is not supported)"
++            f"tp_size {remote_tp_size} for heterogeneous-TP P/D"
 +        )
 +        return self.tp_rank // (self.world_size // remote_tp_size)
 +
      def _moriio_handshake(
          self,
          host: str,
-@@ -1101,7 +1140,9 @@
+@@ -1101,7 +1150,9 @@
          # a hack to keep us moving. We will switch when moving to etcd
          # or where we have a single ZMQ socket in the scheduler.
  
@@ -77,7 +91,7 @@
          path = make_zmq_path("tcp", host, port + port_offset)
          logger.debug("handshake Querying metadata on path: %s", path)
  
-@@ -1233,8 +1274,10 @@
+@@ -1233,8 +1284,10 @@
              block_size, kv_latent_dim = block_shape
              self.slot_size_bytes = kv_elem_size * kv_latent_dim
          else:
@@ -90,7 +104,7 @@
              block_rank = 3  # [block_size, kv_heads, head_dim]
              block_shape = first_kv_cache.shape[-block_rank:]
              block_size, n_kv_heads, head_dim = block_shape[-3:]
-@@ -1257,10 +1300,17 @@
+@@ -1257,10 +1310,17 @@
          caches_data = []
  
          for cache_or_caches in kv_caches.values():
@@ -110,7 +124,7 @@
                  caches_data.append((base_addr, region_len, cache.device.index, ""))
                  kv_caches_base_addr.append(base_addr)
  
-@@ -1338,13 +1388,45 @@
+@@ -1338,13 +1398,45 @@
          done_sending, done_recving = set(), set()
  
          if self.is_producer:
@@ -161,7 +175,7 @@
              }
          else:
              if self.mode == MoRIIOMode.WRITE:
-@@ -1389,7 +1471,13 @@
+@@ -1389,7 +1481,13 @@
                  if last.Succeeded():
                      host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
                      done_req_ids.add(xfer_id)
@@ -176,7 +190,7 @@
                      to_remove.append(req_id)
                  elif last.Failed():
                      logger.error(
-@@ -1402,7 +1490,9 @@
+@@ -1402,7 +1500,9 @@
                      )
                      host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
                      try:
@@ -187,7 +201,7 @@
                      except Exception:
                          logger.exception(
                              "Failed to send error notification for request %s",
-@@ -1488,6 +1578,15 @@
+@@ -1488,6 +1588,15 @@
          """
          self.transfer_id_to_request_id = metadata.transfer_id_to_request_id
          if self.is_producer:
@@ -203,7 +217,7 @@
              self.moriio_wrapper.async_wait_reqid()
              return
          if self.mode == MoRIIOMode.WRITE:
-@@ -1560,6 +1659,7 @@
+@@ -1560,6 +1669,7 @@
              remote_block_ids=meta.remote_block_ids,
              remote_host=meta.remote_host,
              remote_notify_port=meta.remote_notify_port,
@@ -211,7 +225,7 @@
          )
  
      def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer):
-@@ -1653,6 +1753,7 @@
+@@ -1653,6 +1763,7 @@
          local_block_ids: list[int],
          remote_block_ids: list[int],
          remote_moriio_meta: MoRIIOAgentMetadata,
@@ -219,7 +233,7 @@
      ) -> tuple[list[int], list[int], list[int]]:
          """Compute transfer offsets for block data.
  
-@@ -1661,25 +1762,110 @@
+@@ -1661,25 +1772,100 @@
              local_block_ids: IDs of local blocks
              remote_block_ids: IDs of remote blocks
              remote_moriio_meta: Metadata of the remote MoRIIO agent
@@ -291,43 +305,31 @@
 +            slot_elems *= layer_shape[ax]
 +
 +        # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) -----------
-+        # When local (decode) and remote (prefill) tp_size differ, _remote_tp_rank
-+        # maps each local rank to the single remote rank it reads from. That
-+        # whole-block read is byte-correct *only* when KV heads are REPLICATED,
-+        # i.e. the remote rank holds exactly the head(s) this rank owns and no
-+        # more. vLLM replicates KV heads whenever tp_size >= total_kv_heads
-+        # (ModelConfig.get_num_kv_heads -> max(1, total//tp)); with the
-+        # r // tp_ratio rank mapping, head ownership then lines up exactly, so
-+        # no head offset is needed. This is MiniMax-M3's regime (4 KV heads,
-+        # TP>=4) and is the only heterogeneous-TP case MoRIIO supports.
++        # When P/D TP sizes differ, _remote_tp_rank maps each decode rank to a
++        # single remote rank; that whole-block read is byte-correct only when KV
++        # heads are REPLICATED on the remote (prefill) side.
++        #
++        # Supported regimes (replicated heads, i.e. remote_heads <= local_heads):
++        #   * D-TP > P-TP (e.g. P4/D8): multiple decode ranks share one prefill
++        #     rank's slice (floor-ratio mapping).
++        #   * P-TP > D-TP (e.g. P8/D4): each decode rank reads from the first
++        #     prefill rank of its head group (tp_rank * ratio mapping).
++        #     MiniMax-M3's regime: 4 KV heads, replicated at TP>=4.
 +        #
-+        # If instead a remote rank packs MORE distinct KV heads than this local
-+        # rank owns (total_kv_heads > remote_tp, i.e. heads are SPLIT on the
-+        # producer), each fan-in rank would have to read only its head slice of
-+        # the remote block. MoRIIO's per-block tensors are NHD
-+        # ([block_size, kv_heads, head_dim] -- heads interleaved per token), so
-+        # a head slice is NOT a contiguous sub-region and cannot be expressed as
-+        # a single (offset, len) per block. NIXL only supports head splitting in
-+        # HND layout and raises otherwise; we do the same -- fail loud rather
-+        # than silently corrupt KV. MLA / rank-3 indexer caches are always
-+        # replicated (no K/V axis) and never hit this path.
++        # Unsupported: heads SPLIT on prefill (remote_heads > local_heads).
++        # MoRIIO's NHD layout (heads interleaved per token) makes a head slice
++        # non-contiguous and inexpressible as a single (offset, len) per block.
++        # NIXL raises for the same reason; we do the same. MLA / rank-3 indexer
++        # caches are always replicated (no K/V axis) and bypass this guard.
 +        local_tp = self.world_size
 +        remote_tp = remote_tp_size if remote_tp_size is not None else local_tp
 +        if remote_tp != local_tp and not self.use_mla and kv_axis is not None:
 +            total_kv_heads = self.model_config.get_total_num_kv_heads()
-+            if remote_tp > local_tp:
-+                # Prefill TP > decode TP: this rank would need to fan IN reads
-+                # from multiple remote ranks (NIXL's negative-tp_ratio path).
-+                raise NotImplementedError(
-+                    f"Heterogeneous-TP with remote (prefill) tp_size {remote_tp} "
-+                    f"> local (decode) tp_size {local_tp} requires multi-rank "
-+                    "fan-in reads, not supported by MoRIIOConnector."
-+                )
 +            remote_heads = max(1, total_kv_heads // remote_tp)
 +            local_heads = max(1, total_kv_heads // local_tp)
 +            if remote_heads > local_heads:
-+                # KV heads are split (not replicated) on the producer -> would
-+                # need NHD head slicing, which MoRIIO can't express per block.
++                # KV heads are SPLIT on prefill -- whole-block read is incorrect.
++                # Reachable only when D-TP > P-TP (heads/rank never grows with TP).
 +                raise NotImplementedError(
 +                    f"Heterogeneous-TP head splitting (total_kv_heads "
 +                    f"{total_kv_heads} > prefill tp_size {remote_tp}: "
@@ -336,6 +338,8 @@
 +                    "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= "
 +                    "total_kv_heads so KV heads are replicated."
 +                )
++            # remote_heads == local_heads: replicated; _remote_tp_rank selects the
++            # correct remote rank. NOTE: remote_heads < local_heads is not rejected.
 +
 +        transfer_size_byte = slot_elems * sz
  

From db261e0fbb76a9704b2220ad9d8c14ccd3aa8a69 Mon Sep 17 00:00:00 2001
From: TianDi101 <ditian12@amd.com>
Date: Fri, 19 Jun 2026 13:03:09 +0000
Subject: [PATCH 15/20] fix(moriio-diff): correct hunk header count after
 _remote_tp_rank expansion

The P>D fix grew the _remote_tp_rank hunk by 14 lines (24 -> 38 added), but
the hunk header was only bumped by 10, to +1100,40, leaving it 4 lines short;
patch aborted with "malformed patch at line 79". Update it to +1100,44 to
match the actual 6 context + 38 added lines.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
index 4835397b0..83ae80d13 100644
--- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
+++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
@@ -35,7 +35,7 @@
  
          role = "producer" if self.is_producer else "consumer"
          engine_suffix = (
-@@ -1085,6 +1100,40 @@
+@@ -1085,6 +1100,44 @@
                          req_id.decode(),
                      )
  

From 09efb99c17f07aebfd93f96bbda91dd99557a77c Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Fri, 19 Jun 2026 14:38:47 +0000
Subject: [PATCH 16/20] fix(amd-disagg): keep MoRIIO patch cmd inside container
 bash -lc quotes

The MoRIIO KV-layout patch was injected into the per-node container launch
via '"${_MORIIO_PATCH_CMD:-}"', which breaks out of the outer
srun bash -c "..." double-quoted string. Because the patch command value
contains spaces and the shell operators '<' and '||', the unquoted
expansion word-split the generated container script, truncating it right
after the word `patch` and silently dropping the patch arguments AND the
server.sh launch. The container then exited 0:0 within seconds, producing
no benchmark/eval output -> collect_latest_results found "No logs
directory" -> the launch step failed with exit 1 (all minimax-m3 disagg
jobs affected).

Fix: expand ${_MORIIO_PATCH_CMD:-} directly inside the inner bash -lc
single quotes (no quote toggling), so the patch command stays intact and
its operators are parsed by the container shell. Validated end-to-end:
gsm8k recovers from ~0 (garbage) to 0.94-0.98 across P8D8/P4D8/P8D4.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 benchmarks/multi_node/amd_utils/job.slurm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 1a546b361..3afa103f2 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -640,7 +640,7 @@ fi
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         set -o pipefail
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
-        '"${_MORIIO_PATCH_CMD:-}"'
+        ${_MORIIO_PATCH_CMD:-}
         '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
     '
 

From aad872a4c69482d702017d337d5bffee539b50f7 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Fri, 19 Jun 2026 14:23:59 -0400
Subject: [PATCH 17/20] disagg #1762: add 2P TP4 + 1D TP8 layout at conc
 256,512,768,1024 (1k1k & 8k1k)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an
8-GPU node) feeding one TP8 decode (DECODE_NODES=1) — 3 nodes total. Added to
both seq-len scenarios at conc 256,512,768,1024. Eval marking unchanged (still
one lm-eval on the 8k1k TP8+TP8 layout).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 46 +++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index aa4887ad0..ccce01f4f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2801,11 +2801,29 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
-    # 8k1k disagg sweep across three P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
-    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024). The multi-node
-    # eval policy (8k1k + conc >= 16) marks one lm-eval on the highest-max-conc
-    # layout (TP8+TP8, eval-conc=median=128) — validating the M3 MoRI-IO disagg
-    # pipeline's correctness end-to-end.
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
+    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8
+    # conc 256..1024). The multi-node eval policy (8k1k + conc >= 16) marks one
+    # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) —
+    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
     - isl: 8192
       osl: 1024
       search-space:
@@ -2860,3 +2878,21 @@ minimaxm3-fp8-mi355x-vllm-disagg:
           dp-attn: false
           additional-settings:
           - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"

From bbb0d78e57eb7aa2d01ef54df8761026fb3510c8 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Mon, 22 Jun 2026 13:52:09 +0000
Subject: [PATCH 18/20] chore(amd-disagg): remove redundant
 moriio_heterogeneous_kv.py patcher

The per-layer READ-offset fix this Python patcher applied to
moriio_connector.py is fully subsumed by the unified overlay
patches/moriio/moriio-minimax-m3-disagg.diff, which job.slurm applies
with `patch -p1` BEFORE server.sh sources setup_deps.sh. The diff
rewrites the exact lines the patcher searches for (the `first_layer`
single-offset block and the `is_mla = len(self.kv_cache_shape)` sizing),
with a stronger geometry-memoized + heterogeneous-TP-aware version, so
the patcher's OLD1/OLD2 patterns no longer match and it already no-ops
("pattern not found; skipping") in the real flow. It's also the same
fix now upstreamed in vLLM #46039 (READ mixed KV layouts).

Drop the dead patcher and its setup_deps.sh hook so the diff is the
single source of truth. patches/README.md only documents the diff (no
reference to this patcher), so no README change is needed.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../patches/moriio_heterogeneous_kv.py        | 145 ------------------
 benchmarks/multi_node/amd_utils/setup_deps.sh |  23 ---
 2 files changed, 168 deletions(-)
 delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py

diff --git a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py
deleted file mode 100644
index a7ee8c724..000000000
--- a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#!/usr/bin/env python3
-"""Patch vLLM's MoRIIOConnector to transfer heterogeneous KV caches per-layer.
-
-Why
----
-MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model:
-
-  * main attention layers register a ``FullAttentionSpec`` KV cache:
-      rank-5 ``[2, num_blocks, block_size, num_kv_heads, head_dim]``, **fp8**, K+V
-  * the lightning indexer (sparse layers) registers a separate
-    ``MLAAttentionSpec`` index cache (``MiniMaxM3IndexerCache``):
-      rank-3 ``[num_blocks, block_size, head_dim]``, **bf16**, key-only
-
-The upstream MoRIIOConnector assumes a *single uniform* KV layout: it derives
-``self.kv_cache_shape`` / ``block_len`` / ``element_size`` from the **first**
-cache, and ``_read_blocks`` computes the transfer offsets **once** from
-``first_layer`` and reuses them for **every** layer (see the in-code TODO
-"block_len needs to be per-layer for ... hybrid attn"). For M3 this transfers
-the bf16 key-only rank-3 index cache using the fp8 K+V rank-5 main-cache sizing,
-corrupting the indexer state on the decode worker. The sparse layers then select
-the wrong KV blocks and the model emits incoherent tokens (gsm8k ~= 0).
-
-This is the vLLM analogue of the already-shipped SGLang MoRI DSA fix in
-``patches/mori_conn.py`` (see patches/README.md).
-
-Fix
----
-Compute transfer geometry **per layer** from each layer's own tensor
-(``shape`` / ``stride`` / ``element_size`` / rank), instead of from the first
-cache. For homogeneous models every layer's geometry equals the first cache's,
-so behaviour is unchanged; only hybrid models (M3) are affected.
-
-Two minimal, targeted edits (READ path, which the M3 recipe uses with
-``read_mode: true``):
-
-  1. ``_compute_block_transfer_offsets`` -> use ``self.kv_caches[layer_name]``'s
-     own shape (rank/dims) instead of the global ``self.kv_cache_shape``.
-  2. ``_read_blocks`` -> call ``_compute_block_transfer_offsets`` inside the
-     per-layer loop instead of once for ``first_layer``.
-
-Idempotent: re-running detects the ``PATCHED heterogeneous-kv`` marker and exits.
-"""
-import os
-import sys
-
-
-def _default_target() -> str:
-    try:
-        import vllm
-    except Exception:
-        return ""
-    return os.path.join(
-        os.path.dirname(vllm.__file__),
-        "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py",
-    )
-
-
-OLD1 = '''        assert self.kv_cache_shape is not None, "KV caches shape not initialized"
-        is_mla = len(self.kv_cache_shape) == 3
-        stride = self.kv_caches[layer_name].stride()
-        sz = self.kv_caches[layer_name].element_size()
-        if is_mla:
-            blknum, blksize, hs = self.kv_cache_shape
-            hn = 1
-            block_stride = stride[0]
-        else:
-            _, blknum, blksize, hn, hs = self.kv_cache_shape'''
-
-NEW1 = '''        # [PATCHED heterogeneous-kv] Use this layer's own shape so caches with a
-        # different rank/dtype (MiniMax-M3: bf16 key-only rank-3 index cache vs
-        # fp8 K+V rank-5 main cache) are sized per-layer, not from the first cache.
-        layer_shape = tuple(self.kv_caches[layer_name].shape)
-        assert layer_shape, "KV caches shape not initialized"
-        is_mla = len(layer_shape) == 3
-        stride = self.kv_caches[layer_name].stride()
-        sz = self.kv_caches[layer_name].element_size()
-        if is_mla:
-            blknum, blksize, hs = layer_shape
-            hn = 1
-            block_stride = stride[0]
-        else:
-            _, blknum, blksize, hn, hs = layer_shape'''
-
-OLD2 = '''        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
-        offs = self._compute_block_transfer_offsets(
-            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
-        )
-
-        for layer_name in self.layer_name_to_local_kv_cache_metadata:
-            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
-                layer_name
-            )
-            # TODO : apply multi-session batch-read when moriio support it
-            transfer_status = self.moriio_wrapper.read_remote_data(
-                offs[2], offs[0], offs[1], sessions[sess_idx]
-            )'''
-
-NEW2 = '''        for layer_name in self.layer_name_to_local_kv_cache_metadata:
-            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
-                layer_name
-            )
-            # [PATCHED heterogeneous-kv] Per-layer offsets so the bf16 key-only
-            # MiniMax-M3 index cache is transferred with its own geometry instead
-            # of the first (main fp8 K+V) layer's.
-            offs = self._compute_block_transfer_offsets(
-                layer_name, local_block_ids, remote_block_ids, remote_moriio_meta
-            )
-            # TODO : apply multi-session batch-read when moriio support it
-            transfer_status = self.moriio_wrapper.read_remote_data(
-                offs[2], offs[0], offs[1], sessions[sess_idx]
-            )'''
-
-
-def main() -> int:
-    target = sys.argv[1] if len(sys.argv) > 1 else _default_target()
-    if not target or not os.path.isfile(target):
-        print(f"[PATCH] moriio_connector.py not found ({target!r}); skipping")
-        return 0
-    src = open(target).read()
-    if "PATCHED heterogeneous-kv" in src:
-        print("[PATCH] moriio heterogeneous-kv already applied")
-        return 0
-    if OLD1 not in src:
-        print("[PATCH] WARN: _compute_block_transfer_offsets pattern not found; "
-              "connector version changed — skipping (no-op)")
-        return 0
-    if OLD2 not in src:
-        print("[PATCH] WARN: _read_blocks pattern not found; "
-              "connector version changed — skipping (no-op)")
-        return 0
-    src = src.replace(OLD1, NEW1, 1).replace(OLD2, NEW2, 1)
-    # Validate it still compiles before writing.
-    try:
-        compile(src, target, "exec")
-    except SyntaxError as e:
-        print(f"[PATCH] ERROR: patched source fails to compile: {e}")
-        return 1
-    open(target, "w").write(src)
-    print("[PATCH] Applied: moriio heterogeneous-kv per-layer transfer "
-          "(MiniMax-M3 sparse index cache)")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh
index 3e5d82c0c..35eaf17dc 100644
--- a/benchmarks/multi_node/amd_utils/setup_deps.sh
+++ b/benchmarks/multi_node/amd_utils/setup_deps.sh
@@ -185,28 +185,6 @@ install_transformers_glm5() {
     _SETUP_INSTALLED+=("transformers-glm5")
 }
 
-# ---------------------------------------------------------------------------
-# vLLM: Patch MoRIIOConnector for heterogeneous (hybrid sparse-attn) KV caches.
-#
-# MiniMax-M3 registers a bf16 key-only rank-3 lightning-indexer cache alongside
-# the fp8 K+V rank-5 main cache. Upstream MoRIIO derives one uniform block
-# geometry from the first cache and reuses the first layer's transfer offsets
-# for every layer, corrupting the index cache on the decode worker -> garbage
-# output (gsm8k ~= 0). The overlay makes the READ path compute geometry/offsets
-# per layer. Idempotent; no-op on connector versions that don't match.
-# See patches/moriio_heterogeneous_kv.py and patches/README.md.
-# ---------------------------------------------------------------------------
-patch_moriio_heterogeneous_kv() {
-    local patcher
-    patcher="$(dirname "${BASH_SOURCE[0]}")/patches/moriio_heterogeneous_kv.py"
-    if [[ ! -f "$patcher" ]]; then
-        echo "[SETUP] moriio heterogeneous-kv patcher not found, skipping"
-        return 0
-    fi
-    python3 "$patcher" || echo "[SETUP] WARN: moriio heterogeneous-kv patch returned non-zero"
-    _SETUP_INSTALLED+=("moriio-heterogeneous-kv")
-}
-
 # =============================================================================
 # Run installers (engine-gated)
 # =============================================================================
@@ -214,7 +192,6 @@ patch_moriio_heterogeneous_kv() {
 if [[ "$ENGINE" == "vllm-disagg" ]]; then
     install_recipe_deps
     install_amd_quark
-    patch_moriio_heterogeneous_kv
 
     # =========================================================================
     # vLLM: Export UCX/RIXL paths (persists since this file is sourced)

From 815c78c3594ff9626937ff4816265282da4f9518 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Wed, 24 Jun 2026 12:55:22 +0000
Subject: [PATCH 19/20] Use upstream nightly image for MiniMax-M3 disagg, drop
 MoRIIO overlay

- Joint work with Gupta, Ravi

All three MoRIIO fixes the in-tree overlay carried have merged upstream and now
ship in the ROCm nightly image:
  - vLLM #46039  READ-mode mixed KV-layout (axis-aware per-layer offsets)
  - vLLM #46290  WRITE-mode per-geometry offset caching
  - vLLM #46332  heterogeneous-TP rank mapping + ACK fan-in

Point minimaxm3-fp8-mi355x-vllm-disagg at
vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15
(vLLM 0.23.1rc1.dev363+g556bc4e3a, which contains all three merges) and remove
the stop-gap overlay:
  - delete patches/moriio/moriio-minimax-m3-disagg.diff
  - drop the job.slurm in-container auto-apply block (+ MORIIO_KV_PATCH gate)
  - trim the moriio/ section from patches/README.md

Verified on the nightly image with NO patch across all four P/D layouts x
conc {1,4,8}, gsm8k strict/flexible 0.95-0.97 (1P8+1D8, 1P4+1D8, 1P4+1D4,
2P4+1D8) -- matching the previously-patched results.

Refs #1762.
---
 .github/configs/amd-master.yaml               |   2 +-
 benchmarks/multi_node/amd_utils/job.slurm     |  49 --
 .../multi_node/amd_utils/patches/README.md    | 105 +---
 .../moriio/moriio-minimax-m3-disagg.diff      | 483 ------------------
 4 files changed, 6 insertions(+), 633 deletions(-)
 delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8e017fa96..38908ec86 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2737,7 +2737,7 @@ minimaxm3-fp8-mi325x-vllm-mtp:
 # TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in
 # benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8).
 minimaxm3-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:minimax-m3
+  image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 8a1fda4c8..977bcaecc 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -80,54 +80,6 @@ if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \
     export EXTRA_DOCKER_MOUNTS
     echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}"
 fi
-
-# ── In-tree vLLM MoRIIO patch: auto-apply for known-affected images ──
-# The vLLM MoRIIOConnector (image vllm/vllm-openai-rocm:minimax-m3) ships a
-# transposed-KV-layout bug: it assumes the FlashAttention layout
-# [2, num_blocks, ...] (K/V axis outer) but this vLLM's backends allocate
-# [num_blocks, 2, ...] (K/V axis inner), so every disagg block transfer reads
-# the wrong region. Invisible to throughput, but corrupts GQA/non-MLA accuracy
-# (MiniMax-M3 gsm8k 0.0008 -> 0.958). Fix ships as a unified diff (see
-# patches/moriio/ and patches/README.md), applied to the vLLM package dir
-# inside the container at startup, ahead of the server launch below.
-#
-# Auto-applied when the image tag contains "minimax-m3" (and not the already-
-# fixed "-hetkv" rebuild), unless the caller sets MORIIO_KV_PATCH=skip. The
-# repo is already bind-mounted at DOCKER_MOUNT_PATH ("/workspace"), so the
-# diff needs no extra mount -- just an in-container `patch` call. A failed
-# apply aborts the container: silently running unpatched would silently
-# corrupt accuracy, not just skip a feature.
-#
-# The single diff bundles three layered fixes (all in patches/moriio/,
-# see patches/README.md):
-#   1. KV-layout: the load-bearing accuracy fix (axis-aware per-layer offsets;
-#      gsm8k 0.0008 -> 0.958). Required for homogeneous TP too.
-#   2. heterogeneous-TP (no-op for homogeneous TP, required for
-#      PREFILL_TP_SIZE != DECODE_TP_SIZE -- see nvidia/amd-master.yaml's TP4+TP8
-#      configs): handshake/notify port addressing maps each decode rank to the
-#      correct prefill rank instead of its own raw tp_rank (stock
-#      MoRIIOConnector has no fan-out concept at all), and guards (fail loud)
-#      the KV-head-split / prefill-TP>decode-TP cases MoRIIO can't serve.
-#   3. dup-ack: with DECODE_TP_SIZE > PREFILL_TP_SIZE, N decode ranks fan in to
-#      1 prefill rank and each sends its own completion ack for the same
-#      transfer_id. Freeing KV blocks on the first ack (the original
-#      MoRIIOConnector behavior) both crashes EngineCore on the late second ack
-#      (AssertionError in Scheduler._update_from_kv_xfer_finished) and risks
-#      silently corrupting KV if the slower decode rank's read is still in
-#      flight when the blocks are reused. Fix mirrors NIXL's
-#      consumer_notification_counts_by_req: producer counts acks per
-#      transfer_id (consumer embeds its own tp_size in the notify message) and
-#      only frees once all expected consumers have acked.
-_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff"
-_MORIIO_PATCH_CMD=""
-if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \
-   && [[ -f "$_MORIIO_DIFF" ]] \
-   && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \
-   && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then
-    _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff || exit 1"
-    echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout + heterogeneous-TP + dup-ack diff inside container: ${_MORIIO_DIFF}"
-fi
-
 xP="${xP:-1}"
 yD="${yD:-1}"
 
@@ -640,7 +592,6 @@ fi
     \"$DOCKER_IMAGE_NAME\" bash -lc '
         set -o pipefail
         mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'
-        ${_MORIIO_PATCH_CMD:-}
         '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log
     '
 
diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md
index 4941540e3..765d571b2 100644
--- a/benchmarks/multi_node/amd_utils/patches/README.md
+++ b/benchmarks/multi_node/amd_utils/patches/README.md
@@ -8,18 +8,16 @@ block our benchmark + accuracy configs — so we can keep reusing the
 
 - `mori_conn.py` — single-file overlay (bind-mounted) for the **sglang**
   MoRI backend.
-- `moriio/` — unified-diff overlay (applied with `patch` at container
-  startup) for the **vLLM** MoRIIO connector (`minimax-m3` image). See its
-  section below.
+
+> Note: the vLLM MoRIIO `minimax-m3` overlay (`moriio/`) was retired once the
+> upstream fixes (vLLM #46039 / #46290 / #46332) shipped in the ROCm nightly
+> image; `minimaxm3-fp8-mi355x-vllm-disagg` now runs the stock nightly directly.
 
 The `mori_conn.py` overlay is wired through the `EXTRA_DOCKER_MOUNTS` env
 var that `job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after
 the existing `-v` block). The local-test driver scripts under
 `scripts/sglang_disagg/` pre-set this env var to the path of the relevant
-overlay; CI runners that need the patch can do the same. The `moriio/`
-diff needs no extra mount — the repo (and thus the diff file) is already
-bind-mounted into the container — `job.slurm` just runs `patch` against it
-before launching the server; see "How to enable" in its section below.
+overlay; CI runners that need the patch can do the same.
 
 ## `mori_conn.py`
 
@@ -82,99 +80,6 @@ When this env var is unset (CI default for runs that don't need the
 patch), `${EXTRA_DOCKER_MOUNTS:-}` expands to the empty string and
 container behavior is byte-identical to the unpatched path.
 
-## `moriio/` (vLLM MoRIIO connector, MiniMax-M3)
-
-A single unified diff (`moriio-minimax-m3-disagg.diff`), applied with
-`patch -p1` against the vLLM package dir inside the container, touching
-three files:
-
-```
-/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/
-  ├── moriio_connector.py
-  ├── moriio_engine.py
-  └── moriio_common.py
-```
-
-Source: forked from the stock `vllm/vllm-openai-rocm:minimax-m3` image
-(vLLM `0.22.1rc1.dev490`).
-
-**Bug (general MoRIIO, not M3-specific):** the connector assumed the
-FlashAttention KV layout `[2, num_blocks, block_size, heads, head_dim]`
-(K/V axis **outer**), but this vLLM's attention backends (standard
-`TRITON_ATTN` **and** the M3 sparse backend) allocate
-`[num_blocks, 2, block_size, heads, head_dim]` (K/V axis **inner**).
-`_compute_block_transfer_offsets` indexed blocks with `stride[1]` (the
-K/V stride) instead of `stride[0]` (the block stride), so every disagg
-block transfer read the wrong region. Invisible to throughput
-benchmarks (they don't check output); only the **gsm8k accuracy eval**
-catches it. The connector was only ever correct for MLA models
-(DeepSeek, rank-3 path); MiniMax-M3 is GQA + sparse lightning-indexer
-→ broken (disagg gsm8k `0.0008` token salad).
-
-**Fix** — axis-aware offset computation: detect the block axis + optional
-size-2 K/V axis from each layer's real shape/stride, compute offsets per
-distinct geometry (handles M3's 2nd geometry, the rank-3 bf16 key-only
-indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets
-per geometry. Result: disagg gsm8k `strict-match 0.9583 /
-flexible-extract 0.9575` (matches single-node). Homogeneous models
-(uniform layout) are unaffected — one geometry, one offset set, same
-result. Full write-up in
-`/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`.
-
-The diff also bundles two heterogeneous-TP layers (no-op for homogeneous
-TP, exercised by `nvidia/amd-master.yaml`'s TP4-prefill + TP8-decode
-configs):
-
-- **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always
-  addresses remote rank == local `tp_rank`, which has no listener once
-  `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode
-  rank to the correct single prefill rank. Two regimes, both requiring
-  **replicated** KV heads (`tp_size >= total_kv_heads`, ≤1 distinct head
-  per rank — MiniMax-M3 has 4 KV heads, so any TP≥4 is replicated):
-  - `D-TP > P-TP` (e.g. P4/D8): `tp_rank // ratio`, mirroring NIXL's
-    `TpKVTopology.get_target_remote_ranks`. Multiple decode ranks read
-    from one prefill rank.
-  - `P-TP > D-TP` (e.g. P8/D4): vLLM distributes heads across prefill
-    ranks in consecutive pairs — (rank0,rank1)→head0, (rank2,rank3)→head1,
-    etc. Decode rank k must connect to the **first** rank of its head group:
-    `tp_rank * ratio`. Using `tp_rank` directly (as the original patch did)
-    is wrong for ranks > 0: decode rank 1 lands on prefill rank 1 (holds
-    head0) instead of prefill rank 2 (holds head1), producing garbage KV.
-  The one unsupported case — KV-head **splitting** (`total_kv_heads >
-  prefill_tp`, where each prefill rank holds a distinct head subset that
-  a decode rank would need to slice from NHD layout, unrepresentable as a
-  single `(offset,len)` per block) — **raises `NotImplementedError`** in
-  `_compute_block_transfer_offsets`. (NIXL likewise only splits heads in
-  HND layout and raises otherwise.)
-- **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode
-  ranks read from one prefill rank and each ACKs the same `transfer_id`.
-  The producer now counts ACKs per `transfer_id` (consumer embeds its own
-  `tp_size` in the notify payload) and only reports `finished_sending`
-  once all expected consumers have ACKed — preventing both the late-ACK
-  `EngineCore` crash and freeing/reusing KV blocks while a slower decode
-  rank is still reading. Mirrors NIXL's
-  `consumer_notification_counts_by_req`.
-
-### How to enable
-
-`job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains
-`minimax-m3` (and not the already-fixed `-hetkv` rebuild), unless the
-caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g. the
-`m3_disagg_manual/run_manual_2node.sh` driver, which sets
-`MORIIO_KV_PATCH`), run inside the container before the server starts:
-
-```bash
-patch -p1 -d /usr/local/lib/python3.12/dist-packages \
-  < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
-```
-
-(`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already
-mounts into the container at `/workspace`.)
-
-This lets the **stock** `minimax-m3` image be reused for the E2E
-accuracy run — no `-hetkv` rebuild needed. Retire the overlay once the
-fix lands in a published image; it is not yet upstreamed.
-
 ## When to use which patch
 
 | Image / version | Need `mori_conn.py` overlay? |
diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
deleted file mode 100644
index 83ae80d13..000000000
--- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff
+++ /dev/null
@@ -1,483 +0,0 @@
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py
-@@ -80,6 +80,10 @@
-     writes_done: int = 0
-     decode_dp_rank: int = 0
-     transfer_offset: tuple[list[int], list[int], list[int]] | None = None
-+    # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for
-+    # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a
-+    # single entry. See MoRIIOWriter._prepare_transfer_plan.
-+    transfer_offsets: dict = field(default_factory=dict)
- 
- 
- class ROLE(Enum):
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
-@@ -740,6 +740,21 @@
-         # Completions that arrived before transfer_id_to_request_id was populated.
-         # Retried each step until the mapping is established.
-         self._unmatched_write_completions: set[str] = set()
-+        # Producer side: with heterogeneous-TP P/D (DECODE_TP_SIZE >
-+        # PREFILL_TP_SIZE), more than one decode rank reads this rank's KV
-+        # slice and each sends its own completion notify for the SAME
-+        # transfer_id once its own read finishes. We must not report
-+        # finished_sending (which lets the core scheduler free/reuse the
-+        # blocks) until *all* of them have acked -- otherwise a still-in-
-+        # flight slower reader can read corrupted/reused memory. This counts
-+        # per-transfer_id notifies against the expected fan-in count (derived
-+        # from the consumer's own tp_size, sent alongside the transfer_id --
-+        # see send_notify call sites) and only resolves once it's complete.
-+        # Mirrors NIXL's consumer_notification_counts_by_req. Pruned in
-+        # start_load_kv() once a transfer_id drops out of the live mapping
-+        # (e.g. force-freed by the scheduler's defer_timeout without ever
-+        # reaching full count).
-+        self._consumer_notification_counts: dict[str, int] = {}
- 
-         role = "producer" if self.is_producer else "consumer"
-         engine_suffix = (
-@@ -1085,6 +1100,44 @@
-                         req_id.decode(),
-                     )
- 
-+    def _remote_tp_rank(self, remote_tp_size: int) -> int:
-+        """Map this worker's local tp_rank to the single remote tp_rank it must
-+        address when local and remote TP sizes differ (heterogeneous-TP P/D).
-+
-+        Two regimes (both require KV heads to be REPLICATED, not split -- see
-+        guard in _compute_block_transfer_offsets):
-+
-+        * decode-TP > prefill-TP (e.g. P4/D8): multiple decode ranks (in groups
-+          of ratio = decode_tp // prefill_tp) share one prefill rank's KV slice.
-+          floor(local_tp_rank / ratio) maps each decode rank to its prefill rank.
-+          Mirrors NIXL TpKVTopology.get_target_remote_ranks.
-+        * prefill-TP > decode-TP (e.g. P8/D4): vLLM distributes 4 KV heads
-+          across 8 prefill ranks in consecutive pairs -- (rank0,rank1)→head0,
-+          (rank2,rank3)→head1, etc. Each decode rank must address the FIRST rank
-+          of its paired group: local_tp_rank * ratio (NOT the same-indexed rank,
-+          which would land in the wrong head's group for ranks > 0).
-+          Head-splitting is rejected in _compute_block_transfer_offsets.
-+        """
-+        if remote_tp_size == self.world_size:
-+            return self.tp_rank
-+        if remote_tp_size > self.world_size:
-+            # Prefill-TP > decode-TP (e.g. P8/D4, replicated KV heads).
-+            # vLLM pairs prefill ranks per head: decode rank k must connect to
-+            # the first prefill rank of its head group (k * ratio), NOT rank k.
-+            # Example (P8/D4, 4 KV heads): decode rank 1 (head1) → prefill
-+            # rank 2 (not rank 1, which holds head0 alongside rank 0).
-+            assert remote_tp_size % self.world_size == 0, (
-+                f"remote tp_size {remote_tp_size} must be a multiple of local "
-+                f"tp_size {self.world_size} for heterogeneous-TP P/D"
-+            )
-+            return self.tp_rank * (remote_tp_size // self.world_size)
-+        # Decode-TP > prefill-TP: floor-map multiple decode ranks to one prefill rank.
-+        assert self.world_size % remote_tp_size == 0, (
-+            f"local tp_size {self.world_size} must be a multiple of remote "
-+            f"tp_size {remote_tp_size} for heterogeneous-TP P/D"
-+        )
-+        return self.tp_rank // (self.world_size // remote_tp_size)
-+
-     def _moriio_handshake(
-         self,
-         host: str,
-@@ -1101,7 +1150,9 @@
-         # a hack to keep us moving. We will switch when moving to etcd
-         # or where we have a single ZMQ socket in the scheduler.
- 
--        port_offset = get_port_offset(remote_dp_rank, self.tp_rank)
-+        port_offset = get_port_offset(
-+            remote_dp_rank, self._remote_tp_rank(remote_tp_size)
-+        )
-         path = make_zmq_path("tcp", host, port + port_offset)
-         logger.debug("handshake Querying metadata on path: %s", path)
- 
-@@ -1233,8 +1284,10 @@
-             block_size, kv_latent_dim = block_shape
-             self.slot_size_bytes = kv_elem_size * kv_latent_dim
-         else:
--            # [2 (k and v), num_blocks, ...]
--            self.num_blocks = first_kv_cache.shape[1]
-+            # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V
-+            # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read
-+            # shape[1] here, which is the size-2 K/V axis, not num_blocks.)
-+            self.num_blocks = first_kv_cache.shape[0]
-             block_rank = 3  # [block_size, kv_heads, head_dim]
-             block_shape = first_kv_cache.shape[-block_rank:]
-             block_size, n_kv_heads, head_dim = block_shape[-3:]
-@@ -1257,10 +1310,17 @@
-         caches_data = []
- 
-         for cache_or_caches in kv_caches.values():
--            cache_list = [cache_or_caches] if use_mla else cache_or_caches
-+            # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs
-+            # rank-5 (full attention, [K, V]). A single global use_mla flag
-+            # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for
-+            # hybrid models, so detect per cache. region_len is the actual tensor
-+            # (or K/V half) byte size -- equivalent to num_blocks * block_len for
-+            # homogeneous models, correct for heterogeneous ones.
-+            cache_is_mla = cache_or_caches.dim() == 3
-+            cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches
-             for cache in cache_list:
-                 base_addr = cache.data_ptr()
--                region_len = self.num_blocks * self.block_len
-+                region_len = cache.numel() * cache.element_size()
-                 caches_data.append((base_addr, region_len, cache.device.index, ""))
-                 kv_caches_base_addr.append(base_addr)
- 
-@@ -1338,13 +1398,45 @@
-         done_sending, done_recving = set(), set()
- 
-         if self.is_producer:
--            # pop_finished_req_ids returns transfer_ids (the ZMQ payload sent
--            # by decode via send_notify); map back to req_ids for the scheduler.
--            finished_transfer_ids = self.moriio_wrapper.pop_finished_req_ids()
-+            # pop_finished_req_ids returns every completion message received
-+            # since the last call (NOT deduped -- with heterogeneous-TP
-+            # fan-out, two different decode ranks legitimately send
-+            # byte-identical messages for the same transfer_id and each one
-+            # must be counted). Payload is "<transfer_id>:<consumer_tp_size>"
-+            # (see send_notify call sites); plain transfer_id with no ":" is
-+            # treated as a 1:1 ack (internal WRITE-mode completions).
-+            finished_transfer_msgs = self.moriio_wrapper.pop_finished_req_ids()
-+            resolved_transfer_ids: set[str] = set()
-+            for raw_msg in finished_transfer_msgs:
-+                xfer_id, _, tp_size_str = raw_msg.rpartition(":")
-+                if not xfer_id:
-+                    xfer_id, tp_size_str = raw_msg, str(self.world_size)
-+                if xfer_id not in self.transfer_id_to_request_id:
-+                    logger.warning(
-+                        "Could not find %s in transfer_id_to_request_id "
-+                        "lookup table. This could lead to a possible hang.",
-+                        xfer_id,
-+                    )
-+                    continue
-+                consumer_tp_size = int(tp_size_str)
-+                if consumer_tp_size > self.world_size:
-+                    assert consumer_tp_size % self.world_size == 0, (
-+                        f"consumer tp_size {consumer_tp_size} must be a "
-+                        f"multiple of producer tp_size {self.world_size} "
-+                        "for heterogeneous-TP P/D"
-+                    )
-+                    expected_acks = consumer_tp_size // self.world_size
-+                else:
-+                    expected_acks = 1
-+                count = self._consumer_notification_counts.get(xfer_id, 0) + 1
-+                if count >= expected_acks:
-+                    self._consumer_notification_counts.pop(xfer_id, None)
-+                    resolved_transfer_ids.add(xfer_id)
-+                else:
-+                    self._consumer_notification_counts[xfer_id] = count
-             done_sending = {
-                 self.transfer_id_to_request_id[xfer_id]
--                for xfer_id in finished_transfer_ids
--                if xfer_id in self.transfer_id_to_request_id
-+                for xfer_id in resolved_transfer_ids
-             }
-         else:
-             if self.mode == MoRIIOMode.WRITE:
-@@ -1389,7 +1481,13 @@
-                 if last.Succeeded():
-                     host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
-                     done_req_ids.add(xfer_id)
--                    self.moriio_wrapper.send_notify(xfer_id, host, port)
-+                    # Embed our own tp_size so the producer can tell, with
-+                    # heterogeneous-TP fan-out, how many consumer acks to
-+                    # expect for this transfer_id before it's safe to free
-+                    # the blocks (see _consumer_notification_counts).
-+                    self.moriio_wrapper.send_notify(
-+                        f"{xfer_id}:{self.world_size}", host, port
-+                    )
-                     to_remove.append(req_id)
-                 elif last.Failed():
-                     logger.error(
-@@ -1402,7 +1500,9 @@
-                     )
-                     host, port, xfer_id = self._recving_transfers_callback_addr[req_id]
-                     try:
--                        self.moriio_wrapper.send_notify(xfer_id, host, port)
-+                        self.moriio_wrapper.send_notify(
-+                            f"{xfer_id}:{self.world_size}", host, port
-+                        )
-                     except Exception:
-                         logger.exception(
-                             "Failed to send error notification for request %s",
-@@ -1488,6 +1588,15 @@
-         """
-         self.transfer_id_to_request_id = metadata.transfer_id_to_request_id
-         if self.is_producer:
-+            # Drop counts for transfer_ids that dropped out of the live
-+            # mapping without ever reaching full ack count (e.g. force-freed
-+            # by the scheduler's defer_timeout) -- they can never resolve via
-+            # get_finished() anymore, so stop tracking them to bound memory.
-+            self._consumer_notification_counts = {
-+                xfer_id: count
-+                for xfer_id, count in self._consumer_notification_counts.items()
-+                if xfer_id in self.transfer_id_to_request_id
-+            }
-             self.moriio_wrapper.async_wait_reqid()
-             return
-         if self.mode == MoRIIOMode.WRITE:
-@@ -1560,6 +1669,7 @@
-             remote_block_ids=meta.remote_block_ids,
-             remote_host=meta.remote_host,
-             remote_notify_port=meta.remote_notify_port,
-+            remote_tp_size=meta.tp_size,
-         )
- 
-     def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer):
-@@ -1653,6 +1763,7 @@
-         local_block_ids: list[int],
-         remote_block_ids: list[int],
-         remote_moriio_meta: MoRIIOAgentMetadata,
-+        remote_tp_size: int | None = None,
-     ) -> tuple[list[int], list[int], list[int]]:
-         """Compute transfer offsets for block data.
- 
-@@ -1661,25 +1772,100 @@
-             local_block_ids: IDs of local blocks
-             remote_block_ids: IDs of remote blocks
-             remote_moriio_meta: Metadata of the remote MoRIIO agent
-+            remote_tp_size: tp_size of the remote (producer/prefill) instance.
-+                Defaults to this worker's world_size (homogeneous P/D TP). When
-+                it differs, used to validate that KV heads are replicated (the
-+                only heterogeneous-TP regime MoRIIO supports) -- see the guard
-+                below.
-         Returns:
-             Tuple of (local_offsets, remote_offsets, transfer_sizes)
-         """
-         assert self.kv_cache_shape is not None, "KV caches shape not initialized"
--        is_mla = len(self.kv_cache_shape) == 3
--        stride = self.kv_caches[layer_name].stride()
--        sz = self.kv_caches[layer_name].element_size()
--        if is_mla:
--            blknum, blksize, hs = self.kv_cache_shape
--            hn = 1
--            block_stride = stride[0]
--        else:
--            _, blknum, blksize, hn, hs = self.kv_cache_shape
--            local_ktov_stride = stride[0]
--            block_stride = stride[1]
--            remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks
-+        # Per-layer, axis-aware geometry.
-+        #
-+        # The KV tensors vLLM hands the connector are laid out (verified on
-+        # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1):
-+        #   * main attention (GQA, dense + sparse layers):
-+        #       shape (num_blocks, 2, block_size, num_kv_heads, head_dim)
-+        #       -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0.
-+        #   * sparse lightning-indexer (key-only, bf16):
-+        #       shape (num_blocks, block_size, head_dim)  -- rank 3, no K/V axis.
-+        #
-+        # The legacy code assumed the FlashAttention-style [2, num_blocks, ...]
-+        # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V
-+        # stride) -- transposing block vs K/V so every block read the wrong
-+        # region (corruption invisible to throughput-only benchmarks). Instead,
-+        # detect the block axis (size == num_blocks) and the optional K/V axis
-+        # (size 2) from THIS layer's own shape, and derive strides from them. The
-+        # per-block stride is independent of num_blocks, so no remote-num_blocks
-+        # scaling is needed.
-+        layer_cache = self.kv_caches[layer_name]
-+        layer_shape = tuple(layer_cache.shape)
-+        stride = layer_cache.stride()
-+        sz = layer_cache.element_size()
-+        rank = len(layer_shape)
-+
-+        # K/V axis = the size-2 axis among the two outermost dims (if any).
-+        kv_axis: int | None = None
-+        if rank >= 4:
-+            if layer_shape[0] == 2:
-+                kv_axis = 0
-+            elif layer_shape[1] == 2:
-+                kv_axis = 1
-+        # Block axis = outermost non-K/V axis (the one indexed by block_id).
-+        block_axis = 0
-+        if kv_axis == 0:
-+            block_axis = 1
-+        block_stride = stride[block_axis]
-+        kv_stride = stride[kv_axis] if kv_axis is not None else 0
-+        per_block = layer_shape[kv_axis] if kv_axis is not None else 1  # 2 (K,V) or 1
-+
-+        # One transferred slab = all dims except the block and K/V axes.
-+        slot_elems = 1
-+        for ax in range(rank):
-+            if ax == block_axis or ax == kv_axis:
-+                continue
-+            slot_elems *= layer_shape[ax]
-+
-+        # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) -----------
-+        # When P/D TP sizes differ, _remote_tp_rank maps each decode rank to a
-+        # single remote rank; that whole-block read is byte-correct only when KV
-+        # heads are REPLICATED on the remote (prefill) side.
-+        #
-+        # Supported regimes (replicated heads, i.e. remote_heads <= local_heads):
-+        #   * D-TP > P-TP (e.g. P4/D8): multiple decode ranks share one prefill
-+        #     rank's slice (floor-ratio mapping).
-+        #   * P-TP > D-TP (e.g. P8/D4): each decode rank reads from same-indexed
-+        #     prefill rank (self.tp_rank mapping). MiniMax-M3's regime: 4 KV heads
-+        #     fully replicated at TP>=4.
-+        #
-+        # Unsupported: heads SPLIT on prefill (remote_heads > local_heads).
-+        # MoRIIO's NHD layout (heads interleaved per token) makes a head slice
-+        # non-contiguous and inexpressible as a single (offset, len) per block.
-+        # NIXL raises for the same reason; we do the same. MLA / rank-3 indexer
-+        # caches are always replicated (no K/V axis) and bypass this guard.
-+        local_tp = self.world_size
-+        remote_tp = remote_tp_size if remote_tp_size is not None else local_tp
-+        if remote_tp != local_tp and not self.use_mla and kv_axis is not None:
-+            total_kv_heads = self.model_config.get_total_num_kv_heads()
-+            remote_heads = max(1, total_kv_heads // remote_tp)
-+            local_heads = max(1, total_kv_heads // local_tp)
-+            if remote_heads > local_heads:
-+                # KV heads are SPLIT on prefill -- whole-block read is incorrect.
-+                # Applies in both TP-mismatch directions; fail loud.
-+                raise NotImplementedError(
-+                    f"Heterogeneous-TP head splitting (total_kv_heads "
-+                    f"{total_kv_heads} > prefill tp_size {remote_tp}: "
-+                    f"{remote_heads} heads/rank on prefill vs {local_heads} on "
-+                    "decode) requires per-head slicing of an NHD KV layout, not "
-+                    "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= "
-+                    "total_kv_heads so KV heads are replicated."
-+                )
-+            # remote_heads <= local_heads: replicated. _remote_tp_rank selects the
-+            # correct remote rank; whole-block read is byte-correct.
-+
-+        transfer_size_byte = slot_elems * sz
- 
--        transfer_size_byte = blksize * hn * hs * sz
--        per_block = 1 if is_mla else 2
-         total = len(local_block_ids) * per_block
-         offset_local = [0] * total
-         offset_remote = [0] * total
-@@ -1688,17 +1874,9 @@
-         w = 0
-         for i, lb in enumerate(local_block_ids):
-             rb = remote_block_ids[i]
--            # K
--            offset_local[w] = sz * (lb * block_stride)
--            offset_remote[w] = sz * (rb * block_stride)
--            w += 1
--            if not is_mla:
--                # V
--                # Handle num_block variations originating from PD (different kv strides)
--                # TODO: address block_sz differences in heterogeneous TP scenarios
--                # In MLA, we don't need to consider these two cases.
--                offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride)
--                offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride)
-+            for kv in range(per_block):
-+                offset_local[w] = sz * (lb * block_stride + kv * kv_stride)
-+                offset_remote[w] = sz * (rb * block_stride + kv * kv_stride)
-                 w += 1
- 
-         merged_l, merged_r, merged_s = self.merge_contiguous_blocks(
-@@ -1715,6 +1893,7 @@
-         transfer_id: str,
-         remote_host: str,
-         remote_notify_port: int,
-+        remote_tp_size: int,
-     ) -> None:
-         if self.mode == MoRIIOMode.WRITE:
-             return
-@@ -1722,15 +1901,30 @@
-         dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0)
-         sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id)
- 
--        first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0]
--        offs = self._compute_block_transfer_offsets(
--            first_layer, local_block_ids, remote_block_ids, remote_moriio_meta
--        )
--
--        for layer_name in self.layer_name_to_local_kv_cache_metadata:
--            sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index(
--                layer_name
-+        # Heterogeneous-KV models register layers with different shapes/dtypes in
-+        # a single KV-cache group sharing one block table, so block_ids match
-+        # across layers but per-block byte geometry does not. Compute offsets per
-+        # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3
-+        # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing.
-+        layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys())
-+        offs_by_geom: dict = {}
-+        for sess_idx, layer_name in enumerate(layer_names):
-+            layer_cache = self.kv_caches[layer_name]
-+            geom_key = (
-+                tuple(layer_cache.shape),
-+                tuple(layer_cache.stride()),
-+                layer_cache.dtype,
-             )
-+            offs = offs_by_geom.get(geom_key)
-+            if offs is None:
-+                offs = self._compute_block_transfer_offsets(
-+                    layer_name,
-+                    local_block_ids,
-+                    remote_block_ids,
-+                    remote_moriio_meta,
-+                    remote_tp_size=remote_tp_size,
-+                )
-+                offs_by_geom[geom_key] = offs
-             # TODO : apply multi-session batch-read when moriio support it
-             transfer_status = self.moriio_wrapper.read_remote_data(
-                 offs[2], offs[0], offs[1], sessions[sess_idx]
-@@ -1739,6 +1933,6 @@
-                 self._recving_transfers[request_id].append(transfer_status)
-                 self._recving_transfers_callback_addr[request_id] = (
-                     remote_host,
--                    str(remote_notify_port + self.tp_rank),
-+                    str(remote_notify_port + self._remote_tp_rank(remote_tp_size)),
-                     transfer_id,
-                 )
---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
-+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py
-@@ -279,21 +279,33 @@
-         Returns:
-             The transfer plan
-         """
--        # Compute offsets if not cached
--        if request_info.transfer_offset is None:
-+        # Compute offsets per distinct layer geometry. Heterogeneous-KV models
-+        # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8
-+        # caches in one KV-cache group; caching a single offset set per request
-+        # and reusing it for every layer corrupts the indexer cache. Block_ids
-+        # are shared (single block table), so offsets depend only on the layer's
-+        # shape/stride/dtype -- memoize by that geometry key.
-+        layer_cache = self.worker.kv_caches[task.layer_name]
-+        geom_key = (
-+            tuple(layer_cache.shape),
-+            tuple(layer_cache.stride()),
-+            layer_cache.dtype,
-+        )
-+        offsets = request_info.transfer_offsets.get(geom_key)
-+        if offsets is None:
-             offsets = self.worker._compute_block_transfer_offsets(
-                 task.layer_name,
-                 task.local_block_ids,
-                 request_info.block_ids,
-                 remote_moriio_meta,
-             )
--            request_info.transfer_offset = offsets
-+            request_info.transfer_offsets[geom_key] = offsets
- 
-         # Get session index
-         layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys())
-         sess_idx = layer_names.index(task.layer_name)
- 
--        local_off, remote_off, sizes = request_info.transfer_offset
-+        local_off, remote_off, sizes = offsets
- 
-         return LayerTransferPlan(
-             request_id=task.request_id,
-@@ -671,9 +683,14 @@
-             raise
- 
-     def pop_finished_req_ids(self):
--        # producer invocation: get the set of completed requests at the decode
-+        # Producer invocation: get all completion messages received since the
-+        # last call. Returned as a list, NOT deduped -- with heterogeneous-TP
-+        # fan-out, two different decode ranks can send byte-identical
-+        # messages for the same transfer_id, and the caller (get_finished())
-+        # needs to count every individual occurrence to know when all
-+        # expected consumers have acked.
-         with self.lock:
--            done_send = set(self.done_req_ids)
-+            done_send = list(self.done_req_ids)
-             self.done_req_ids = []
-         return done_send
- 

From 38be6bed1f3138b001f99f325e8fd68b3cb27163 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Wed, 24 Jun 2026 18:38:00 -0400
Subject: [PATCH 20/20] fix: append M3 MI355X disagg changelog entry at end of
 file

The minimaxm3-fp8-mi355x-vllm-disagg entry was inserted mid-file (after
the #1862 entry), which violates the append-only changelog gate
("entry 511 changed; existing entries are immutable"). Move it to the
end of perf-changelog.yaml so existing entries stay byte-identical to
main and the new entry is a clean append.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 perf-changelog.yaml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 50ce696af..54fb2d7dd 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4072,18 +4072,6 @@
     - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862
 
-- config-keys:
-    - minimaxm3-fp8-mi355x-vllm-disagg
-  description:
-    - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3"
-    - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed"
-    - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)"
-    - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)"
-    - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs"
-    - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)"
-    - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762
-
 - config-keys:
     - dsv4-fp4-mi355x-sglang
   description:
@@ -4165,3 +4153,15 @@
     - "Run the PR #1891 MiniMax-M3 MXFP8 B300 Dynamo-vLLM recipe set on top of current main."
     - "Uses the vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image and the TEP4/TEP8 8k1k topologies not covered by PR #1890."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1891
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm-disagg
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3"
+    - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed"
+    - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)"
+    - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)"
+    - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs"
+    - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)"
+    - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762