From fe4cb3368ab102c9a70d481f35ee7f4b48cc08df Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 19:12:12 -0400 Subject: [PATCH 01/20] [Klaud Cold] minimaxm3-fp8-mi355x-vllm-disagg: day-zero MoRI-IO disagg smoke test MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3): 1 prefill (TP8) + 1 decode (TP8) at conc 1, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on the MoRI-IO patch-removal infra (#1585): brings in that PR's amd_utils changes (setup_deps.sh / server_vllm.sh / submit.sh / models_vllm.yaml mori -> mori_low_latency) and the two job.slurm hunks (vllm-router image bump nightly-20260511 -> nightly-20260603, drop VLLM_MORIIO_CONNECTOR_READ_MODE env), while keeping main's atom-disagg support intact. Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 tool/reasoning parsers; no EP (TP8, MoE experts TP-sharded as in the single-node M3 TP8 recipe). perf-changelog.yaml and amd-master.yaml contain only M3 changes. 
Co-Authored-By: Claude Fable 5 --- .github/configs/amd-master.yaml | 37 ++ benchmarks/multi_node/amd_utils/job.slurm | 3 +- .../multi_node/amd_utils/models_vllm.yaml | 15 +- .../multi_node/amd_utils/server_vllm.sh | 6 +- benchmarks/multi_node/amd_utils/setup_deps.sh | 559 +----------------- benchmarks/multi_node/amd_utils/submit.sh | 1 - .../minimaxm3_fp8_mi355x_vllm-disagg.sh | 78 +++ perf-changelog.yaml | 12 + 8 files changed, 145 insertions(+), 566 deletions(-) create mode 100644 benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5e5452c4c..d7433e4d1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2729,3 +2729,40 @@ minimaxm3-fp8-mi325x-vllm-mtp: - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp } - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp } + +# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the +# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to +# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on +# the MoRI-patch-removal infra (#1585). No EP (TP8 only); MoE experts are +# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in +# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8). 
+minimaxm3-fp8-mi355x-vllm-disagg: + image: vllm/vllm-openai-rocm:minimax-m3 + model: MiniMaxAI/MiniMax-M3-MXFP8 + model-prefix: minimaxm3 + runner: mi355x-disagg + precision: fp8 + framework: vllm-disagg + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 1 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 17f5b4f54..67160c262 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -316,7 +316,7 @@ SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" # vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}" +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" @@ -401,7 +401,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then -e UCX_LOG_LEVEL=warn -e HSA_ENABLE_SDMA=1 -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300} - -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} -e PYTHONPYCACHEPREFIX=/tmp/pycache ) elif [[ "$ENGINE" == "atom-disagg" ]]; then diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index b051de8d9..e78b6c647 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -26,15 +26,15 @@ amd-Llama-3.3-70B-Instruct-FP8-KV: Kimi-K2.5-MXFP4: prefill_flags: "--tensor-parallel-size 8 --compilation-config 
'{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" - decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" + decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" hf_dir: "models--amd--Kimi-K2.5-MXFP4" MiniMax-M2.5: # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup. # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE. 
- prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" - decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" + decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1" hf_dir: "models--MiniMaxAI--MiniMax-M2.5" @@ -42,3 +42,12 @@ gpt-oss-120b: prefill_flags: "--tensor-parallel-size 8" decode_flags: "--tensor-parallel-size 8" env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" + +MiniMax-M3-MXFP8: + # MiniMax-M3 MXFP8 disagg smoke test (TP8 prefill + TP8 decode, no EP). + # --block-size 128 is mandatory (MSA sparse/index cache); text-only benchmark + # so --language-model-only frees the vision encoder. gfx950 uses FP8 KV cache. 
+ prefill_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice" + decode_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice" + env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_USE_BREAKABLE_CUDAGRAPH=0 VLLM_ENGINE_READY_TIMEOUT_S=3600" + hf_dir: "models--MiniMaxAI--MiniMax-M3-MXFP8" diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index d61fe0359..f02b1cd56 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", 
\"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \ ${PREFILL_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then @@ -478,7 +478,7 @@ else --served-model-name ${SERVED_MODEL} \ --port $SERVER_PORT \ --trust-remote-code \ - --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \ + --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \ ${DECODE_SERVER_CONFIG}" if [[ "$DRY_RUN" -eq 1 ]]; then diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index add2e3fa5..35eaf17dc 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -3,8 +3,8 @@ # setup_deps.sh — Install missing disagg dependencies at container start. # # Dispatched by $ENGINE (set by server.sh dispatcher): -# vllm-disagg -> vLLM/MoRI-IO patches + UCX/RIXL path exports -# (base image: vllm/vllm-openai-rocm:v0.18.0) +# vllm-disagg -> recipe deps + amd-quark + UCX/RIXL path exports +# (base image: vllm/vllm-openai-rocm:nightly) # sglang-disagg -> SGLang aiter gluon patch + per-model installs # (base image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-*) # @@ -79,556 +79,6 @@ install_amd_quark() { _SETUP_INSTALLED+=("amd-quark") } -# --------------------------------------------------------------------------- -# 8. 
Patch vLLM MoRI-IO save_kv_layer busy-spin (C128 tail-batch deadlock) -# In WRITE mode, save_kv_layer spins forever waiting for the handshake -# callback to set write_ready_flags. This blocks the model worker thread, -# preventing it from responding to EngineCore shm_broadcast, causing a -# TimeoutError cascade and crash. -# Patch: add time.sleep(0.001) and a 30s timeout to yield CPU and prevent -# the model worker from deadlocking. -# --------------------------------------------------------------------------- -patch_moriio_save_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - # Already patched? - if "[PATCHED] save_kv_layer timeout" in src: - print("[SETUP] save_kv_layer timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - continue""" - - if old not in src: - print("[SETUP] WARN: save_kv_layer busy-spin pattern not found, skipping patch") - sys.exit(0) - - new = """ # [PATCHED] save_kv_layer — null guard + timeout + sleep - if remote_engine_id is None: - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _SAVE_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.write_ready_flags - ): - _elapsed = _time.monotonic() - _wait_start - if _elapsed > _SAVE_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] save_kv_layer: timeout (%.1fs) waiting for " - "write_ready_flags[%s], breaking to unblock model " - "worker", _elapsed, remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - 
print("[SETUP] Patched save_kv_layer: null guard + timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch save_kv_layer: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-save-kv-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 9. Patch MoRIIO waiting_for_transfer_complete with bounded timeout -# The original status.Wait() blocks forever if an RDMA completion never -# arrives (e.g., NIC queue saturation at C256). This replaces the unbounded -# wait with a polling loop using status.Succeeded() + configurable timeout. -# Also adds error handling to the write worker loop so a single failed -# transfer doesn't kill the background thread. -# --------------------------------------------------------------------------- -patch_moriio_transfer_timeout() { - python3 -c ' -import os, sys, textwrap - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine as me - f = me.__file__ - src = open(f).read() - - if "[PATCHED] transfer completion timeout" in src: - print("[SETUP] transfer completion timeout patch already applied") - sys.exit(0) - - # --- Patch 1: Replace waiting_for_transfer_complete with polling + timeout --- - old_wait = """ def waiting_for_transfer_complete(self): - if not self.transfer_status: - return - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - for status in transfers_to_wait: - try: - status.Wait() - if not status.Succeeded(): - logger.error( - "Transfer failed: %s, Code: %s", status.Message(), status.Code() - ) - raise TransferError("MoRIIO transfer failed!") - except Exception as e: - logger.error("Transfer %s failed: %s", status, e) - raise""" - - new_wait = """ def waiting_for_transfer_complete(self): - # [PATCHED] transfer completion timeout — bounded polling loop - import time as _time, os as _os - if not self.transfer_status: - return - - _timeout = 
float(_os.environ.get("VLLM_MORIIO_TRANSFER_TIMEOUT", "120")) - - transfers_to_wait = [] - with self.lock: - transfers_to_wait = self.transfer_status[:] - self.transfer_status.clear() - - _start = _time.monotonic() - remaining = list(transfers_to_wait) - _polls = 0 - _completed = 0 - - while remaining: - _elapsed = _time.monotonic() - _start - if _elapsed > _timeout: - logger.error( - "[HANGFIX] transfer_timeout elapsed=%.1fs " - "pending=%d/%d completed=%d polls=%d " - "action=raise_transfer_error", - _elapsed, len(remaining), len(transfers_to_wait), - _completed, _polls, - ) - raise TransferError( - f"RDMA transfer timeout after {_elapsed:.1f}s, " - f"{len(remaining)}/{len(transfers_to_wait)} pending" - ) - - still_waiting = [] - for status in remaining: - try: - if status.Succeeded(): - _completed += 1 - continue - still_waiting.append(status) - except Exception as e: - logger.error( - "[HANGFIX] transfer_poll_error error=%s", e) - raise TransferError( - f"Transfer failed during poll: {e}" - ) from e - - remaining = still_waiting - if remaining: - _time.sleep(0.005) - _polls += 1 - if _polls % 2000 == 0: - logger.warning( - "[HANGFIX] transfer_wait pending=%d " - "completed=%d elapsed=%.1fs timeout=%.0fs", - len(remaining), _completed, - _time.monotonic() - _start, _timeout, - )""" - - if old_wait not in src: - print("[SETUP] WARN: waiting_for_transfer_complete pattern not found") - sys.exit(0) - - new_src = src.replace(old_wait, new_wait) - - # --- Patch 2: Add error handling + cleanup to _write_worker_loop --- - old_loop = """ self._execute_write_task(task)""" - - new_loop = """ try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s write_task_failed error=%s " - "action=cleanup_and_mark_done", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None - ) - except Exception: - 
pass""" - - if old_loop in new_src: - new_src = new_src.replace(old_loop, new_loop, 1) - else: - print("[SETUP] WARN: _write_worker_loop pattern not found for error handling") - - # --- Patch 3: Add deferred task timeout to _process_deferred_tasks --- - old_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks that were previously deferred.\"\"\" - if not self._deferred_tasks: - return - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - if self._is_remote_ready(task): - self._execute_write_task(task) - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - new_deferred = """ def _process_deferred_tasks(self) -> None: - \"\"\"Process tasks that were previously deferred.\"\"\" - # [PATCHED] deferred task timeout — prune stale tasks - import time as _time, os as _os - if not self._deferred_tasks: - return - - _DEFER_TIMEOUT = float( - _os.environ.get("VLLM_MORIIO_DEFER_TIMEOUT", "60")) - - still_deferred: list[WriteTask] = [] - for task in self._deferred_tasks: - _age = _time.monotonic() - getattr(task, "_defer_ts", _time.monotonic()) - if _age > _DEFER_TIMEOUT: - logger.error( - "[HANGFIX] req=%s deferred_task_expired age=%.1fs " - "action=drop_and_mark_done", - task.request_id, _age, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - continue - if self._is_remote_ready(task): - try: - self._execute_write_task(task) - except Exception as _e: - logger.error( - "[HANGFIX] req=%s deferred_write_failed error=%s", - task.request_id, _e, - ) - try: - _wr = self.worker.moriio_wrapper - with _wr.lock: - _wr.done_req_ids.append(task.request_id) - _wr.done_remote_allocate_req_dict.pop( - task.request_id, None) - except Exception: - pass - else: - still_deferred.append(task) - - self._deferred_tasks = still_deferred""" - - if old_deferred in 
new_src: - new_src = new_src.replace(old_deferred, new_deferred, 1) - else: - print("[SETUP] WARN: _process_deferred_tasks pattern not found") - - # --- Patch 4: Stamp defer time when task is deferred --- - old_defer_add = """ self._deferred_tasks.append(task)""" - new_defer_add = """ import time as _time2 - if not hasattr(task, "_defer_ts"): - task._defer_ts = _time2.monotonic() - self._deferred_tasks.append(task)""" - if old_defer_add in new_src: - new_src = new_src.replace(old_defer_add, new_defer_add, 1) - else: - print("[SETUP] WARN: deferred task timestamp patch target not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: transfer timeout + writer error handling") - -except Exception as e: - print(f"[SETUP] WARN patch transfer_timeout: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-transfer-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 10. Patch MoRIIO start_load_kv busy-spin (same pattern as save_kv_layer) -# The READ-mode spin loop in start_load_kv has the same unbounded-spin -# issue as save_kv_layer. Add timeout + sleep + null guard. 
-# --------------------------------------------------------------------------- -patch_moriio_load_kv_timeout() { - python3 -c ' -import os, sys - -try: - import vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_connector as mc - f = mc.__file__ - src = open(f).read() - - if "[PATCHED] start_load_kv timeout" in src: - print("[SETUP] start_load_kv timeout patch already applied") - sys.exit(0) - - old = """ while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - continue""" - - if old not in src: - print("[SETUP] WARN: start_load_kv busy-spin pattern not found, skipping") - sys.exit(0) - - new = """ # [PATCHED] start_load_kv timeout — prevent model worker deadlock - if remote_engine_id is None and not wait_handshake_readd_req: - self._reqs_to_send.update(metadata.reqs_to_send) - return - import time as _time, os as _os - _wait_start = _time.monotonic() - _LOAD_KV_TIMEOUT = float(_os.environ.get("VLLM_MORIIO_HANDSHAKE_TIMEOUT", "30")) - while True: - if ( - self._ready_requests.empty() - and remote_engine_id not in self.load_ready_flag - and wait_handshake_readd_req - ): - if _time.monotonic() - _wait_start > _LOAD_KV_TIMEOUT: - import logging as _logging - _logging.getLogger("vllm.moriio").warning( - "[HANGFIX] start_load_kv: timeout (%.1fs) waiting for " - "load_ready_flag[%s]", _time.monotonic() - _wait_start, - remote_engine_id) - break - _time.sleep(0.001) - continue""" - - new_src = src.replace(old, new) - if new_src == src: - print("[SETUP] WARN: start_load_kv replacement had no effect") - sys.exit(0) - - open(f, "w").write(new_src) - print("[SETUP] Patched start_load_kv busy-spin with timeout + sleep") -except Exception as e: - print(f"[SETUP] WARN patch start_load_kv: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("MoRIIO-load-kv-timeout-patch") -} - -# --------------------------------------------------------------------------- -# 11. 
Fix READ-mode scheduler assertion in _update_from_kv_xfer_finished -# vLLM asserts that a request in finished_recving must be either -# WAITING_FOR_REMOTE_KVS or finished. In READ mode the request can -# transition to RUNNING before the aggregated recv notification arrives, -# crashing the engine with AssertionError. -# (present in v0.17.1 & v0.18.0) -# --------------------------------------------------------------------------- -patch_scheduler_read_mode_fix() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] read-mode recv assertion" in src: - print("[SETUP] scheduler read-mode assertion fix already applied") - sys.exit(0) - - old_recv = """ for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - assert req_id in self.requests - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - else: - assert RequestStatus.is_finished(req.status) - self._free_blocks(self.requests[req_id])""" - - new_recv = """ # [PATCHED] read-mode recv assertion — handle intermediate states - for req_id in kv_connector_output.finished_recving or (): - logger.debug("Finished recving KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping recv", req_id) - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.debug( - "Request %s recv finished but status=%s (not " - "WAITING_FOR_REMOTE_KVS or finished), skipping " - "block free — will be freed on request completion", - req_id, req.status.name)""" - - if old_recv not in src: - print("[SETUP] WARN: scheduler finished_recving pattern not 
found, skipping") - sys.exit(0) - - new_src = src.replace(old_recv, new_recv, 1) - - old_send = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - assert req_id in self.requests - self._free_blocks(self.requests[req_id])""" - - new_send = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - if old_send in new_src: - new_src = new_src.replace(old_send, new_send, 1) - else: - print("[SETUP] WARN: scheduler finished_sending pattern not found") - - open(f, "w").write(new_src) - print("[SETUP] Patched: scheduler _update_from_kv_xfer_finished read-mode fix") - -except Exception as e: - print(f"[SETUP] WARN patch scheduler read-mode: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("scheduler-read-mode-fix") -} - -# --------------------------------------------------------------------------- -# 12. Idle KV block reaper for disaggregated prefill (READ mode) -# The RIXL notification path can lose `finished_sending` signals under -# high concurrency with ibv_post_send failures. This leaves KV blocks -# permanently allocated on the prefill engine even after the decode has -# finished reading. Over multiple benchmark rounds, leaked blocks -# accumulate and eventually saturate the prefill KV cache. -# -# Fix: instrument the scheduler's `schedule()` method to detect idle -# periods (0 running, 0 waiting for >5s) and force-free blocks for -# any remaining requests whose status is finished. 
-# --------------------------------------------------------------------------- -patch_prefill_idle_kv_reaper() { - python3 -c ' -import os, sys - -try: - import vllm.v1.core.sched.scheduler as smod - f = smod.__file__ - src = open(f).read() - - if "[PATCHED] idle-kv-reaper" in src: - print("[SETUP] idle KV block reaper already applied") - sys.exit(0) - - # Find the _update_from_kv_xfer_finished method end and add reaper logic - # We inject into the method that processes KV transfer completions. - marker = "[PATCHED] read-mode recv assertion" - if marker not in src: - print("[SETUP] WARN: scheduler read-mode patch not found, skipping reaper") - sys.exit(0) - - # Add reaper state initialization to __init__ - old_init_marker = "self.finished_recving_kv_req_ids" - if old_init_marker not in src: - print("[SETUP] WARN: finished_recving_kv_req_ids not found in scheduler") - sys.exit(0) - - # Find the first occurrence to insert reaper state - init_pos = src.find(old_init_marker) - # Find the line containing it - line_end = src.find("\n", init_pos) - init_line = src[init_pos:line_end] - - # Add reaper state after this line - reaper_init = init_line + """ - # [PATCHED] idle-kv-reaper state - self._idle_kv_reaper_ts = 0.0 - self._idle_kv_reaper_active = False""" - - src = src.replace(init_line, reaper_init, 1) - - # Now add the reaper logic at the end of _update_from_kv_xfer_finished - # Find the finished_sending handler we patched - send_handler = """ for req_id in kv_connector_output.finished_sending or (): - logger.debug("Finished sending KV transfer for request %s", req_id) - if req_id not in self.requests: - logger.debug("Request %s already removed, skipping send", req_id) - continue - self._free_blocks(self.requests[req_id])""" - - reaper_logic = send_handler + """ - - # [PATCHED] idle-kv-reaper — force-free leaked prefill KV blocks - import time as _time - _REAPER_IDLE_SECS = 5.0 - _num_running = sum(1 for r in self.requests.values() - if r.status == 
RequestStatus.RUNNING) - _should_reap = (_num_running == 0) - - if _should_reap: - if not self._idle_kv_reaper_active: - self._idle_kv_reaper_active = True - self._idle_kv_reaper_ts = _time.monotonic() - elif _time.monotonic() - self._idle_kv_reaper_ts > _REAPER_IDLE_SECS: - _reaped = 0 - _reap_ids = [] - for _rid, _req in list(self.requests.items()): - if RequestStatus.is_finished(_req.status): - _reap_ids.append(_rid) - for _rid in _reap_ids: - try: - _req = self.requests[_rid] - self._free_blocks(_req) - _reaped += 1 - except Exception as _e: - logger.debug("[KV-REAPER] free_blocks failed for %s: %s", _rid, _e) - if _reaped > 0: - logger.warning( - "[KV-REAPER] Force-freed blocks for %d finished " - "requests after %.1fs idle", - _reaped, _time.monotonic() - self._idle_kv_reaper_ts) - self._idle_kv_reaper_ts = _time.monotonic() - else: - self._idle_kv_reaper_active = False""" - - if send_handler in src: - src = src.replace(send_handler, reaper_logic, 1) - else: - print("[SETUP] WARN: send handler not found for reaper injection") - sys.exit(0) - - open(f, "w").write(src) - print("[SETUP] Patched: idle KV block reaper for prefill") - -except Exception as e: - print(f"[SETUP] WARN patch idle-kv-reaper: {e}", file=sys.stderr) -' - _SETUP_INSTALLED+=("idle-kv-reaper") -} - # --------------------------------------------------------------------------- # SGLang: Patch aiter gluon pa_mqa_logits — fix 2D → 3D instr_shape for # Triton ≥ 3.5. 
@@ -742,11 +192,6 @@ install_transformers_glm5() { if [[ "$ENGINE" == "vllm-disagg" ]]; then install_recipe_deps install_amd_quark - patch_moriio_save_kv_timeout - patch_moriio_transfer_timeout - patch_moriio_load_kv_timeout - patch_scheduler_read_mode_fix - patch_prefill_idle_kv_reaper # ========================================================================= # vLLM: Export UCX/RIXL paths (persists since this file is sourced) diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index fa3d65418..fc91a78e8 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -102,7 +102,6 @@ export PROFILER_ARGS=$profiler_args # Engine-specific xP/yD semantics and TP exports if [[ "$ENGINE" == "vllm-disagg" ]]; then export PROXY_STREAM_IDLE_TIMEOUT=${PROXY_STREAM_IDLE_TIMEOUT:-300} - export VLLM_MORIIO_CONNECTOR_READ_MODE=${VLLM_MORIIO_CONNECTOR_READ_MODE:-1} fi # xP = prefill workers, yD = decode workers (may span multiple nodes) export xP=$PREFILL_WORKERS diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh new file mode 100644 index 000000000..a9a28d889 --- /dev/null +++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + CONC_LIST \ + ISL \ + OSL \ + IMAGE \ + SPEC_DECODING \ + MODEL_PATH \ + PREFILL_NUM_WORKERS \ + PREFILL_TP \ + PREFILL_EP \ + PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS \ + DECODE_TP \ + DECODE_EP \ + DECODE_DP_ATTN \ + PREFILL_NODES \ + DECODE_NODES \ + RANDOM_RANGE_RATIO \ + FRAMEWORK + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +set -x + +cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 + +export TIME_LIMIT="08:00:00" +export MODEL_PATH=$MODEL_PATH +export MODEL_NAME=$MODEL_NAME +export 
CONTAINER_IMAGE=$IMAGE + +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then + export PREFILL_ENABLE_EP=false +else + export PREFILL_ENABLE_EP=true +fi + +if [[ "$PREFILL_DP_ATTN" == "true" ]]; then + export PREFILL_ENABLE_DP=true +else + export PREFILL_ENABLE_DP=false +fi + +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then + export DECODE_ENABLE_EP=false +else + export DECODE_ENABLE_EP=true +fi + +if [[ "$DECODE_DP_ATTN" == "true" ]]; then + export DECODE_ENABLE_DP=true +else + export DECODE_ENABLE_DP=false +fi + +JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ + ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ + ${RANDOM_RANGE_RATIO} \ + "${NODELIST:-}") + +if [[ $? -ne 0 ]]; then + echo "Failed to submit job" >&2 + exit 1 +fi + +echo "$JOB_ID" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index eb47ba6ae..3b3735b1a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4014,3 +4014,15 @@ - "1k/1k: 1p1d-dep4-dep8 (conc 4096,12288), 1p4d-dep4-tp8 (conc 4-128), 1p1d-dep4-dep16 (conc 4096,6144)" - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm-disagg + description: + - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3" + - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed" + - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, 
--kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)" + - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)" + - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs" + - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)" + - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762 From aaadc7b8d6f19f5aafcb45d37cfcefe52c8df062 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 21:42:36 -0400 Subject: [PATCH 02/20] amd_utils/job.slurm: auto-download disagg checkpoint when not pre-staged The first MI355X disagg sweep (run 27515119215) failed: the day-zero MiniMax-M3-MXFP8 checkpoint is not staged on the disagg cluster's shared FS, so job.slurm's model search hit a hard FATAL ("Model 'MiniMax-M3-MXFP8' not found. 
Searched: ...") before the engine ever started. The single-node recipes hf-download inside the serving container, but the disagg path historically required ops to pre-stage checkpoints. Add an on-demand fallback to the vllm-disagg model-resolution block: when the checkpoint isn't found, derive the HF repo id from the hf_dir (models--org--name -> org/name) and download into MODEL_DIR in HF cache layout, then resolve the snapshot as MODEL_PATH. Staging into MODEL_DIR keeps MODEL_PATH under the dir that is bind-mounted into the serving container as /models, so the existing -v ${MODEL_DIR}:/models mount and DOCKER_MODEL_PATH (/models) remap both resolve. Implementation notes: - The host has no hf CLI, so the download runs in a one-shot container of the serving image (DOCKER_IMAGE_NAME), which ships huggingface_hub. - flock on a lockfile in MODEL_DIR serializes the prefill/decode nodes; a re-check of snapshots/ under the lock makes it idempotent (resumable). - hf download with a huggingface-cli fallback; 3 retries; HF_TOKEN passed through for gated repos. - Scoped to the vllm-disagg branch only; pre-staged models never reach this path (the search finds them first), so sglang/atom and existing vLLM disagg models (M2.5/Kimi) are unaffected. Co-Authored-By: Claude Fable 5 --- benchmarks/multi_node/amd_utils/job.slurm | 54 +++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 67160c262..bbbaa8ef4 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -165,9 +165,57 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then done if [[ -z "$MODEL_PATH" ]]; then - echo "FATAL: Model '$MODEL_NAME' not found. Searched:" - for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done - exit 1 + # Not pre-staged. 
Unlike the single-node recipes (which hf-download inside + # the serving container), the disagg path historically required ops to + # pre-stage checkpoints, so day-zero models (e.g. MiniMax-M3) FATAL here. + # Auto-stage on demand into MODEL_DIR — the dir mounted into the serving + # container as /models — so the resolved MODEL_PATH stays under MODEL_DIR + # and both the `-v ${MODEL_DIR}:/models` mount and the DOCKER_MODEL_PATH + # (/models) remap resolve. Pre-staged models never reach this branch. + # The host has no hf CLI, so the download runs in a one-shot container of + # the serving image (which ships huggingface_hub). A flock serializes the + # prefill/decode nodes; the re-check under the lock makes it idempotent. + repo_id="$DISK_DIR_NAME" + if [[ "$repo_id" == models--* ]]; then + repo_id="${repo_id#models--}"; repo_id="${repo_id/--//}" + fi + if [[ "$repo_id" != */* ]]; then + echo "FATAL: Model '$MODEL_NAME' not found and cannot derive an HF repo" + echo " id from hf_dir '$DISK_DIR_NAME' to auto-download. Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 + fi + if ! mkdir -p "$MODEL_DIR" 2>/dev/null || [[ ! -w "$MODEL_DIR" ]]; then + echo "FATAL: Model '$MODEL_NAME' not pre-staged and MODEL_DIR" + echo " '$MODEL_DIR' is not writable for auto-download." + exit 1 + fi + echo "Model '$MODEL_NAME' not pre-staged; auto-downloading '$repo_id' into $MODEL_DIR (HF cache layout)" + if docker ps >/dev/null 2>&1; then DK=docker; else DK="sudo docker"; fi + ( + exec 9>"${MODEL_DIR}/.stage-${DISK_DIR_NAME}.lock" + flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; } + if [[ ! 
-d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then + for attempt in 1 2 3; do + $DK run --rm --network host \ + -v "${MODEL_DIR}:${MODEL_DIR}" \ + -e HF_HUB_CACHE="${MODEL_DIR}" \ + -e HF_TOKEN="${HF_TOKEN:-}" \ + "$DOCKER_IMAGE_NAME" \ + bash -lc "hf download '$repo_id' || huggingface-cli download '$repo_id'" && break + [[ $attempt == 3 ]] && { echo "FATAL: hf download failed after $attempt attempts"; exit 1; } + echo "hf download attempt $attempt failed; retrying in 60s"; sleep 60 + done + fi + ) || exit 1 + RESOLVED=$(resolve_hf_cache_path "${MODEL_DIR}/${DISK_DIR_NAME}") + if [[ -d "$RESOLVED" ]]; then + MODEL_PATH="$RESOLVED" + echo "Auto-staged MODEL_PATH: $MODEL_PATH" + else + echo "FATAL: '$MODEL_NAME' still not found after auto-download at ${MODEL_DIR}/${DISK_DIR_NAME}" + exit 1 + fi fi echo "Final MODEL_PATH: $MODEL_PATH" else From 7bfdc822bd093721fa4bbafd2d73ffb2524e5042 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 22:36:56 -0400 Subject: [PATCH 03/20] job.slurm: --entrypoint "" for the auto-download container The disagg auto-download reached hf download but failed all 3 attempts: the one-shot `docker run "$DOCKER_IMAGE_NAME" bash -lc "hf download ..."` did not override the image ENTRYPOINT, so the vllm-openai API server ran with the bash command as its args and died with "Failed to infer device type" (no GPU mounted in the download container). Add --entrypoint "" (as the serving container does) so bash actually runs hf download. 
Co-Authored-By: Claude Fable 5 --- benchmarks/multi_node/amd_utils/job.slurm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index bbbaa8ef4..2eb0d7294 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -197,7 +197,11 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; } if [[ ! -d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then for attempt in 1 2 3; do + # --entrypoint "" so bash runs hf download; the vllm-openai + # image's default entrypoint is the API server, which would + # otherwise try (and fail) to infer a GPU device here. $DK run --rm --network host \ + --entrypoint "" \ -v "${MODEL_DIR}:${MODEL_DIR}" \ -e HF_HUB_CACHE="${MODEL_DIR}" \ -e HF_TOKEN="${HF_TOKEN:-}" \ From 44c6547447e953f3180af2e2c6e9a9f59c2908ce Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Sun, 14 Jun 2026 22:53:29 -0400 Subject: [PATCH 04/20] M3 disagg: use shared HF cache (/it-share/hf-hub-cache); drop auto-download Per maintainer direction, point the MiniMax-M3 disagg model dir at the cluster's shared HF cache where the ~414 GB MXFP8 checkpoint is already staged (/it-share/hf-hub-cache/models--MiniMaxAI--MiniMax-M3-MXFP8), instead of the launcher default /it-share/data. Scoped to M3 only via the M3 disagg script: export MODEL_PATH=/it-share/hf-hub-cache submit.sh exports MODEL_DIR=$MODEL_PATH and job.slurm resolves the snapshot under it (search path #1) and bind-mounts MODEL_DIR into the prefill/decode serving containers. Other disagg models keep /it-share/data. This supersedes the earlier job.slurm auto-download approach, which is reverted: job.slurm now differs from main only by the #1585 mori-removal hunks (router image bump + dropping VLLM_MORIIO_CONNECTOR_READ_MODE). 
Co-Authored-By: Claude Fable 5 --- benchmarks/multi_node/amd_utils/job.slurm | 58 +------------------ .../minimaxm3_fp8_mi355x_vllm-disagg.sh | 7 ++- 2 files changed, 9 insertions(+), 56 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 2eb0d7294..67160c262 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -165,61 +165,9 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then done if [[ -z "$MODEL_PATH" ]]; then - # Not pre-staged. Unlike the single-node recipes (which hf-download inside - # the serving container), the disagg path historically required ops to - # pre-stage checkpoints, so day-zero models (e.g. MiniMax-M3) FATAL here. - # Auto-stage on demand into MODEL_DIR — the dir mounted into the serving - # container as /models — so the resolved MODEL_PATH stays under MODEL_DIR - # and both the `-v ${MODEL_DIR}:/models` mount and the DOCKER_MODEL_PATH - # (/models) remap resolve. Pre-staged models never reach this branch. - # The host has no hf CLI, so the download runs in a one-shot container of - # the serving image (which ships huggingface_hub). A flock serializes the - # prefill/decode nodes; the re-check under the lock makes it idempotent. - repo_id="$DISK_DIR_NAME" - if [[ "$repo_id" == models--* ]]; then - repo_id="${repo_id#models--}"; repo_id="${repo_id/--//}" - fi - if [[ "$repo_id" != */* ]]; then - echo "FATAL: Model '$MODEL_NAME' not found and cannot derive an HF repo" - echo " id from hf_dir '$DISK_DIR_NAME' to auto-download. Searched:" - for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done - exit 1 - fi - if ! mkdir -p "$MODEL_DIR" 2>/dev/null || [[ ! -w "$MODEL_DIR" ]]; then - echo "FATAL: Model '$MODEL_NAME' not pre-staged and MODEL_DIR" - echo " '$MODEL_DIR' is not writable for auto-download." 
- exit 1 - fi - echo "Model '$MODEL_NAME' not pre-staged; auto-downloading '$repo_id' into $MODEL_DIR (HF cache layout)" - if docker ps >/dev/null 2>&1; then DK=docker; else DK="sudo docker"; fi - ( - exec 9>"${MODEL_DIR}/.stage-${DISK_DIR_NAME}.lock" - flock -w 10800 9 || { echo "FATAL: timed out waiting for model-stage lock"; exit 1; } - if [[ ! -d "${MODEL_DIR}/${DISK_DIR_NAME}/snapshots" ]]; then - for attempt in 1 2 3; do - # --entrypoint "" so bash runs hf download; the vllm-openai - # image's default entrypoint is the API server, which would - # otherwise try (and fail) to infer a GPU device here. - $DK run --rm --network host \ - --entrypoint "" \ - -v "${MODEL_DIR}:${MODEL_DIR}" \ - -e HF_HUB_CACHE="${MODEL_DIR}" \ - -e HF_TOKEN="${HF_TOKEN:-}" \ - "$DOCKER_IMAGE_NAME" \ - bash -lc "hf download '$repo_id' || huggingface-cli download '$repo_id'" && break - [[ $attempt == 3 ]] && { echo "FATAL: hf download failed after $attempt attempts"; exit 1; } - echo "hf download attempt $attempt failed; retrying in 60s"; sleep 60 - done - fi - ) || exit 1 - RESOLVED=$(resolve_hf_cache_path "${MODEL_DIR}/${DISK_DIR_NAME}") - if [[ -d "$RESOLVED" ]]; then - MODEL_PATH="$RESOLVED" - echo "Auto-staged MODEL_PATH: $MODEL_PATH" - else - echo "FATAL: '$MODEL_NAME' still not found after auto-download at ${MODEL_DIR}/${DISK_DIR_NAME}" - exit 1 - fi + echo "FATAL: Model '$MODEL_NAME' not found. 
Searched:" + for p in "${SEARCH_PATHS[@]}"; do echo " - $p"; done + exit 1 fi echo "Final MODEL_PATH: $MODEL_PATH" else diff --git a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh index a9a28d889..f54940e29 100644 --- a/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh +++ b/benchmarks/multi_node/minimaxm3_fp8_mi355x_vllm-disagg.sh @@ -31,7 +31,12 @@ set -x cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 export TIME_LIMIT="08:00:00" -export MODEL_PATH=$MODEL_PATH +# MiniMax-M3 MXFP8 (~414 GB) is pre-staged in this cluster's shared HF cache +# (/it-share/hf-hub-cache/models--MiniMaxAI--MiniMax-M3-MXFP8), not the default +# /it-share/data the launcher sets. Point the disagg model dir there for M3 only; +# submit.sh exports MODEL_DIR=$MODEL_PATH and job.slurm resolves the snapshot under +# it and bind-mounts MODEL_DIR into the prefill/decode serving containers. +export MODEL_PATH=/it-share/hf-hub-cache export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE From 718444cb7139c9133c044677d64382faa00394db Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Mon, 15 Jun 2026 01:25:02 -0400 Subject: [PATCH 05/20] disagg #1762: add 8k1k conc-16 row to run an lm-eval (validate correctness) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conc-1 1k1k smoke test never triggered an eval — the multi-node eval policy only marks 8k1k entries with conc >= MIN_EVAL_CONC (16). Add an 8k1k conc-16 row (same 1P TP8 + 1D TP8 layout) so mark_eval_entries marks it run-eval=true (eval-conc=16), running lm-eval through the MoRI-IO disagg pipeline to validate correctness. The conc-1 1k1k row stays the latency smoke test. Run with non-canary-full-sweep-enabled so the (non-min-conc) eval entry runs. 
Co-Authored-By: Claude Fable 5 --- .github/configs/amd-master.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d7433e4d1..145897f36 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2766,3 +2766,26 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + # 8k1k conc-16 row (same 1P TP8 + 1D TP8 layout) exists so the multi-node + # eval policy (8k1k + conc >= MIN_EVAL_CONC=16) marks an lm-eval — validates + # the M3 MoRI-IO disagg pipeline's correctness end-to-end. The conc-1 1k1k + # row above stays the latency smoke test. + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 16 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" From 84c8d8ecdf1deb104513e95c06c556ab5a421509 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Mon, 15 Jun 2026 01:28:42 -0400 Subject: [PATCH 06/20] disagg #1762: sweep conc 1,2,4,8,16 (not just conc 1) Widen the 1k1k disagg latency/throughput sweep from conc 1 to conc 1,2,4,8,16 (1P TP8 + 1D TP8). The 8k1k conc-16 eval row is unchanged. 
Co-Authored-By: Claude Fable 5 --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 145897f36..6286c3766 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2751,7 +2751,7 @@ minimaxm3-fp8-mi355x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 1 ] + conc-list: [ 1, 2, 4, 8, 16 ] prefill: num-worker: 1 tp: 8 From c9a10e081d1d880a25e251f5fc98853fbe76f327 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Mon, 15 Jun 2026 01:29:28 -0400 Subject: [PATCH 07/20] disagg #1762: sweep conc 1,2,4,8,16 at both 1k1k and 8k1k Widen the disagg sweep from conc 1 to conc 1,2,4,8,16 for both seq-len scenarios (1P TP8 + 1D TP8). The 8k1k conc-16 point keeps the multi-node eval marked (eval-conc=16) so lm-eval still validates the MoRI-IO disagg pipeline. Co-Authored-By: Claude Fable 5 --- .github/configs/amd-master.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 6286c3766..ad879f894 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2766,15 +2766,15 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - # 8k1k conc-16 row (same 1P TP8 + 1D TP8 layout) exists so the multi-node - # eval policy (8k1k + conc >= MIN_EVAL_CONC=16) marks an lm-eval — validates - # the M3 MoRI-IO disagg pipeline's correctness end-to-end. The conc-1 1k1k - # row above stays the latency smoke test. + # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc 1,2,4,8,16. The + # conc-16 point also makes the multi-node eval policy (8k1k + conc >= 16) mark + # an lm-eval (eval-conc=16) — validating the M3 MoRI-IO disagg pipeline's + # correctness end-to-end. 
- isl: 8192 osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 16 ] + conc-list: [ 1, 2, 4, 8, 16 ] prefill: num-worker: 1 tp: 8 From 299c401029692e371e9aba2fac4eaf3ca7d408b1 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 17 Jun 2026 16:16:37 +0000 Subject: [PATCH 08/20] Update the vLLM external router container vllm/vllm-router only retains ~16 recent nightlies on Docker Hub; older dated tags are garbage-collected (manifest unknown), which makes `docker run` fail with exit 125 on any node that has not already cached the image. --- benchmarks/multi_node/amd_utils/job.slurm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 67160c262..71503f228 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -315,8 +315,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}" SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_') export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}" -# vLLM external router container -VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260603-e667ebb}" +# vLLM external router container. 
+# NOTE: vllm/vllm-router only retains ~16 recent nightlies on Docker Hub; older +# dated tags are garbage-collected (manifest unknown) +VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260617-e667ebb}" ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}" From 08be1aacb6d3b7274b57ed4e84a08f8c1f154320 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Thu, 18 Jun 2026 05:20:43 +0000 Subject: [PATCH 09/20] M3 disagg: per-layer MoRIIO KV transfer for hybrid sparse-attn (partial) MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model: sparse layers register a separate lightning-indexer cache (MLAAttentionSpec, rank-3, bf16, key-only) alongside the main cache (FullAttentionSpec, rank-5, fp8, K+V). The MoRIIO connector assumes one uniform KV layout -- it derives block geometry from the first cache and reuses first_layer's offsets for every layer (see its own "hybrid attn" TODO) -- so the bf16 key-only index cache is transferred with fp8 K+V sizing and gets corrupted on the decode worker, producing garbage output (disagg gsm8k ~= 0 while single-node M3 is correct). This is the vLLM analogue of the SGLang MoRI DSA-state bug in patches/mori_conn.py. - patches/moriio_heterogeneous_kv.py: compute the READ-path transfer geometry per layer (own shape/stride/dtype/rank) instead of from the first cache. Idempotent; no-op for homogeneous models. - setup_deps.sh: apply it on the vllm-disagg path. NOTE: partial fix -- necessary but not yet sufficient. The index cache is also a separate KV-cache group whose block-table/num_blocks the single-namespace MoRIIO connector cannot map, so M3 disagg accuracy is still broken pending a larger multi-group / index-state transfer change. (Disabling sparse attention is not a viable workaround: M3's fused QKV carries index_k weights, so dropping the indexer breaks weight load.) 
Refs #1762 Co-authored-by: Cursor --- .../patches/moriio_heterogeneous_kv.py | 145 ++++++++++++++++++ benchmarks/multi_node/amd_utils/setup_deps.sh | 23 +++ 2 files changed, 168 insertions(+) create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py diff --git a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py new file mode 100644 index 000000000..a7ee8c724 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +"""Patch vLLM's MoRIIOConnector to transfer heterogeneous KV caches per-layer. + +Why +--- +MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model: + + * main attention layers register a ``FullAttentionSpec`` KV cache: + rank-5 ``[2, num_blocks, block_size, num_kv_heads, head_dim]``, **fp8**, K+V + * the lightning indexer (sparse layers) registers a separate + ``MLAAttentionSpec`` index cache (``MiniMaxM3IndexerCache``): + rank-3 ``[num_blocks, block_size, head_dim]``, **bf16**, key-only + +The upstream MoRIIOConnector assumes a *single uniform* KV layout: it derives +``self.kv_cache_shape`` / ``block_len`` / ``element_size`` from the **first** +cache, and ``_read_blocks`` computes the transfer offsets **once** from +``first_layer`` and reuses them for **every** layer (see the in-code TODO +"block_len needs to be per-layer for ... hybrid attn"). For M3 this transfers +the bf16 key-only rank-3 index cache using the fp8 K+V rank-5 main-cache sizing, +corrupting the indexer state on the decode worker. The sparse layers then select +the wrong KV blocks and the model emits incoherent tokens (gsm8k ~= 0). + +This is the vLLM analogue of the already-shipped SGLang MoRI DSA fix in +``patches/mori_conn.py`` (see patches/README.md). 
+ +Fix +--- +Compute transfer geometry **per layer** from each layer's own tensor +(``shape`` / ``stride`` / ``element_size`` / rank), instead of from the first +cache. For homogeneous models every layer's geometry equals the first cache's, +so behaviour is unchanged; only hybrid models (M3) are affected. + +Two minimal, targeted edits (READ path, which the M3 recipe uses with +``read_mode: true``): + + 1. ``_compute_block_transfer_offsets`` -> use ``self.kv_caches[layer_name]``'s + own shape (rank/dims) instead of the global ``self.kv_cache_shape``. + 2. ``_read_blocks`` -> call ``_compute_block_transfer_offsets`` inside the + per-layer loop instead of once for ``first_layer``. + +Idempotent: re-running detects the ``PATCHED heterogeneous-kv`` marker and exits. +""" +import os +import sys + + +def _default_target() -> str: + try: + import vllm + except Exception: + return "" + return os.path.join( + os.path.dirname(vllm.__file__), + "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py", + ) + + +OLD1 = ''' assert self.kv_cache_shape is not None, "KV caches shape not initialized" + is_mla = len(self.kv_cache_shape) == 3 + stride = self.kv_caches[layer_name].stride() + sz = self.kv_caches[layer_name].element_size() + if is_mla: + blknum, blksize, hs = self.kv_cache_shape + hn = 1 + block_stride = stride[0] + else: + _, blknum, blksize, hn, hs = self.kv_cache_shape''' + +NEW1 = ''' # [PATCHED heterogeneous-kv] Use this layer's own shape so caches with a + # different rank/dtype (MiniMax-M3: bf16 key-only rank-3 index cache vs + # fp8 K+V rank-5 main cache) are sized per-layer, not from the first cache. 
+ layer_shape = tuple(self.kv_caches[layer_name].shape) + assert layer_shape, "KV caches shape not initialized" + is_mla = len(layer_shape) == 3 + stride = self.kv_caches[layer_name].stride() + sz = self.kv_caches[layer_name].element_size() + if is_mla: + blknum, blksize, hs = layer_shape + hn = 1 + block_stride = stride[0] + else: + _, blknum, blksize, hn, hs = layer_shape''' + +OLD2 = ''' first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] + offs = self._compute_block_transfer_offsets( + first_layer, local_block_ids, remote_block_ids, remote_moriio_meta + ) + + for layer_name in self.layer_name_to_local_kv_cache_metadata: + sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( + layer_name + ) + # TODO : apply multi-session batch-read when moriio support it + transfer_status = self.moriio_wrapper.read_remote_data( + offs[2], offs[0], offs[1], sessions[sess_idx] + )''' + +NEW2 = ''' for layer_name in self.layer_name_to_local_kv_cache_metadata: + sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( + layer_name + ) + # [PATCHED heterogeneous-kv] Per-layer offsets so the bf16 key-only + # MiniMax-M3 index cache is transferred with its own geometry instead + # of the first (main fp8 K+V) layer's. 
+ offs = self._compute_block_transfer_offsets( + layer_name, local_block_ids, remote_block_ids, remote_moriio_meta + ) + # TODO : apply multi-session batch-read when moriio support it + transfer_status = self.moriio_wrapper.read_remote_data( + offs[2], offs[0], offs[1], sessions[sess_idx] + )''' + + +def main() -> int: + target = sys.argv[1] if len(sys.argv) > 1 else _default_target() + if not target or not os.path.isfile(target): + print(f"[PATCH] moriio_connector.py not found ({target!r}); skipping") + return 0 + src = open(target).read() + if "PATCHED heterogeneous-kv" in src: + print("[PATCH] moriio heterogeneous-kv already applied") + return 0 + if OLD1 not in src: + print("[PATCH] WARN: _compute_block_transfer_offsets pattern not found; " + "connector version changed — skipping (no-op)") + return 0 + if OLD2 not in src: + print("[PATCH] WARN: _read_blocks pattern not found; " + "connector version changed — skipping (no-op)") + return 0 + src = src.replace(OLD1, NEW1, 1).replace(OLD2, NEW2, 1) + # Validate it still compiles before writing. + try: + compile(src, target, "exec") + except SyntaxError as e: + print(f"[PATCH] ERROR: patched source fails to compile: {e}") + return 1 + open(target, "w").write(src) + print("[PATCH] Applied: moriio heterogeneous-kv per-layer transfer " + "(MiniMax-M3 sparse index cache)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 35eaf17dc..3e5d82c0c 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -185,6 +185,28 @@ install_transformers_glm5() { _SETUP_INSTALLED+=("transformers-glm5") } +# --------------------------------------------------------------------------- +# vLLM: Patch MoRIIOConnector for heterogeneous (hybrid sparse-attn) KV caches. 
+# +# MiniMax-M3 registers a bf16 key-only rank-3 lightning-indexer cache alongside +# the fp8 K+V rank-5 main cache. Upstream MoRIIO derives one uniform block +# geometry from the first cache and reuses the first layer's transfer offsets +# for every layer, corrupting the index cache on the decode worker -> garbage +# output (gsm8k ~= 0). The overlay makes the READ path compute geometry/offsets +# per layer. Idempotent; no-op on connector versions that don't match. +# See patches/moriio_heterogeneous_kv.py and patches/README.md. +# --------------------------------------------------------------------------- +patch_moriio_heterogeneous_kv() { + local patcher + patcher="$(dirname "${BASH_SOURCE[0]}")/patches/moriio_heterogeneous_kv.py" + if [[ ! -f "$patcher" ]]; then + echo "[SETUP] moriio heterogeneous-kv patcher not found, skipping" + return 0 + fi + python3 "$patcher" || echo "[SETUP] WARN: moriio heterogeneous-kv patch returned non-zero" + _SETUP_INSTALLED+=("moriio-heterogeneous-kv") +} + # ============================================================================= # Run installers (engine-gated) # ============================================================================= @@ -192,6 +214,7 @@ install_transformers_glm5() { if [[ "$ENGINE" == "vllm-disagg" ]]; then install_recipe_deps install_amd_quark + patch_moriio_heterogeneous_kv # ========================================================================= # vLLM: Export UCX/RIXL paths (persists since this file is sourced) From 005e16b483493b9c850fcb74da780797f9108a74 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Thu, 18 Jun 2026 07:31:48 +0000 Subject: [PATCH 10/20] feat(amd-disagg): add vLLM MoRIIO KV-layout patch to reuse stock minimax-m3 image The vLLM MoRIIOConnector in vllm/vllm-openai-rocm:minimax-m3 assumes the FlashAttention KV layout [2, num_blocks, ...] (K/V axis outer) but this vLLM's backends allocate [num_blocks, 2, ...] (K/V axis inner), so every disagg block transfer reads the wrong region. 
Invisible to throughput, but corrupts GQA/non-MLA accuracy (MiniMax-M3 gsm8k 0.0008 -> 0.957). Instead of baking a fix into a rebuilt image (-hetkv) or carrying full vendored copies of the patched files in-tree, carry just the 218-line unified diff (patches/moriio/moriio-kv-layout-fix.diff) and apply it with `patch -p1` against the vLLM package dir inside the container at startup, ahead of the server launch. The repo is already bind-mounted into the container, so no EXTRA_DOCKER_MOUNTS wiring is needed -- job.slurm auto-applies the diff when DOCKER_IMAGE_NAME contains "minimax-m3" (skippable with MORIIO_KV_PATCH=skip), mirroring the existing mori_conn.py sglang hook. A failed apply aborts the container instead of silently running unpatched. Validated on a manual 2-node run (n06-21 prefill+router / n09-21 decode) using the STOCK image: gsm8k strict-match 0.9568 / flexible-extract 0.9560 (matches the baked image within noise), decode probe healthy. - patches/moriio/moriio-kv-layout-fix.diff: unified diff vs stock - job.slurm: in-container `patch` step, MORIIO_KV_PATCH=skip opt-out - patches/README.md: document the moriio/ diff-apply mechanism Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/job.slurm | 27 +++ .../multi_node/amd_utils/patches/README.md | 90 +++++++- .../patches/moriio/moriio-kv-layout-fix.diff | 218 ++++++++++++++++++ 3 files changed, 324 insertions(+), 11 deletions(-) create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 71503f228..727f64632 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -81,6 +81,32 @@ if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \ echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}" fi +# ── In-tree vLLM MoRIIO patch: auto-apply for known-affected images ── +# The vLLM MoRIIOConnector (image 
vllm/vllm-openai-rocm:minimax-m3) ships a +# transposed-KV-layout bug: it assumes the FlashAttention layout +# [2, num_blocks, ...] (K/V axis outer) but this vLLM's backends allocate +# [num_blocks, 2, ...] (K/V axis inner), so every disagg block transfer reads +# the wrong region. Invisible to throughput, but corrupts GQA/non-MLA accuracy +# (MiniMax-M3 gsm8k 0.0008 -> 0.958). Fix ships as a unified diff (see +# patches/moriio/ and patches/README.md), applied to the vLLM package dir +# inside the container at startup, ahead of the server launch below. +# +# Auto-applied when the image tag contains "minimax-m3" (and not the already- +# fixed "-hetkv" rebuild), unless the caller sets MORIIO_KV_PATCH=skip. The +# repo is already bind-mounted at DOCKER_MOUNT_PATH ("/workspace"), so the +# diff needs no extra mount -- just an in-container `patch` call. A failed +# apply aborts the container: silently running unpatched would silently +# corrupt accuracy, not just skip a feature. +_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff" +_MORIIO_PATCH_CMD="" +if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \ + && [[ -f "$_MORIIO_DIFF" ]] \ + && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \ + && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then + _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff || exit 1" + echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout diff inside container: ${_MORIIO_DIFF}" +fi + xP="${xP:-1}" yD="${yD:-1}" @@ -593,6 +619,7 @@ fi \"$DOCKER_IMAGE_NAME\" bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' + '"${_MORIIO_PATCH_CMD:-}"' '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md index d9b5de79d..27f9fc81d 
100644 --- a/benchmarks/multi_node/amd_utils/patches/README.md +++ b/benchmarks/multi_node/amd_utils/patches/README.md @@ -1,16 +1,25 @@ -# In-tree sglang patches for the MoRI PD-disagg path +# In-tree patches for the MoRI / MoRIIO PD-disagg path -This directory carries small Python overlays that get bind-mounted over -the upstream sglang source inside the docker container at runtime. -They are needed because some sglang releases ship known bugs in the -MoRI disaggregation backend that block our benchmark + accuracy -configs. +This directory carries small overlays that fix up the engine source inside +the docker container at runtime. They are needed because some published +images ship known bugs in the (MoRI / MoRIIO) disaggregation backend that +block our benchmark + accuracy configs — so we can keep reusing the +**stock image** instead of rebuilding a patched one. -The mount is wired through the `EXTRA_DOCKER_MOUNTS` env var that -`job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after the -existing `-v` block). The local-test driver scripts under -`scripts/sglang_disagg/` pre-set this env var to the path of the -relevant overlay; CI runners that need the patch can do the same. +- `mori_conn.py` — single-file overlay (bind-mounted) for the **sglang** + MoRI backend. +- `moriio/` — unified-diff overlay (applied with `patch` at container + startup) for the **vLLM** MoRIIO connector (`minimax-m3` image). See its + section below. + +The `mori_conn.py` overlay is wired through the `EXTRA_DOCKER_MOUNTS` env +var that `job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after +the existing `-v` block). The local-test driver scripts under +`scripts/sglang_disagg/` pre-set this env var to the path of the relevant +overlay; CI runners that need the patch can do the same. 
The `moriio/` +diff needs no extra mount — the repo (and thus the diff file) is already +bind-mounted into the container — `job.slurm` just runs `patch` against it +before launching the server; see "How to enable" in its section below. ## `mori_conn.py` @@ -73,6 +82,65 @@ When this env var is unset (CI default for runs that don't need the patch), `${EXTRA_DOCKER_MOUNTS:-}` expands to the empty string and container behavior is byte-identical to the unpatched path. +## `moriio/` (vLLM MoRIIO connector, MiniMax-M3) + +A unified diff (`moriio-kv-layout-fix.diff`), applied with `patch -p1` +against the vLLM package dir inside the container, touching three files: + +``` +/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/ + ├── moriio_connector.py + ├── moriio_engine.py + └── moriio_common.py +``` + +Source: forked from the stock `vllm/vllm-openai-rocm:minimax-m3` image +(vLLM `0.22.1rc1.dev490`). + +**Bug (general MoRIIO, not M3-specific):** the connector assumed the +FlashAttention KV layout `[2, num_blocks, block_size, heads, head_dim]` +(K/V axis **outer**), but this vLLM's attention backends (standard +`TRITON_ATTN` **and** the M3 sparse backend) allocate +`[num_blocks, 2, block_size, heads, head_dim]` (K/V axis **inner**). +`_compute_block_transfer_offsets` indexed blocks with `stride[1]` (the +K/V stride) instead of `stride[0]` (the block stride), so every disagg +block transfer read the wrong region. Invisible to throughput +benchmarks (they don't check output); only the **gsm8k accuracy eval** +catches it. The connector was only ever correct for MLA models +(DeepSeek, rank-3 path); MiniMax-M3 is GQA + sparse lightning-indexer +→ broken (disagg gsm8k `0.0008` token salad). 
+ +**Fix** — axis-aware offset computation: detect the block axis + optional +size-2 K/V axis from each layer's real shape/stride, compute offsets per +distinct geometry (handles M3's 2nd geometry, the rank-3 bf16 key-only +indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets +per geometry. Result: disagg gsm8k `strict-match 0.9583 / +flexible-extract 0.9575` (matches single-node). Homogeneous models +(uniform layout) are unaffected — one geometry, one offset set, same +result. Heterogeneous-TP P/D (prefill TP ≠ decode TP) is still a TODO +(same as upstream). Full write-up in +`/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`. + +### How to enable + +`job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains +`minimax-m3` (and not the already-fixed `-hetkv` rebuild), unless the +caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g. the +`m3_disagg_manual/run_manual_2node.sh` driver, which sets +`MORIIO_KV_PATCH`), run inside the container before the server starts: + +```bash +patch -p1 -d /usr/local/lib/python3.12/dist-packages \ + < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff +``` + +(`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already +mounts into the container at `/workspace`.) + +This lets the **stock** `minimax-m3` image be reused for the E2E +accuracy run — no `-hetkv` rebuild needed. Retire the overlay once the +fix lands in a published image; it is not yet upstreamed. + ## When to use which patch | Image / version | Need `mori_conn.py` overlay? 
| diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff new file mode 100644 index 000000000..7f6c435bf --- /dev/null +++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff @@ -0,0 +1,218 @@ +diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +index 73694ce32..a30d30af8 100644 +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +@@ -80,6 +80,10 @@ class RemoteAllocInfo: + writes_done: int = 0 + decode_dp_rank: int = 0 + transfer_offset: tuple[list[int], list[int], list[int]] | None = None ++ # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for ++ # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a ++ # single entry. See MoRIIOWriter._prepare_transfer_plan. ++ transfer_offsets: dict = field(default_factory=dict) + + + class ROLE(Enum): +diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +index 167eef6e1..1846a3c21 100644 +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +@@ -1233,8 +1233,10 @@ class MoRIIOConnectorWorker: + block_size, kv_latent_dim = block_shape + self.slot_size_bytes = kv_elem_size * kv_latent_dim + else: +- # [2 (k and v), num_blocks, ...] +- self.num_blocks = first_kv_cache.shape[1] ++ # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V ++ # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read ++ # shape[1] here, which is the size-2 K/V axis, not num_blocks.) 
++ self.num_blocks = first_kv_cache.shape[0] + block_rank = 3 # [block_size, kv_heads, head_dim] + block_shape = first_kv_cache.shape[-block_rank:] + block_size, n_kv_heads, head_dim = block_shape[-3:] +@@ -1257,10 +1259,17 @@ class MoRIIOConnectorWorker: + caches_data = [] + + for cache_or_caches in kv_caches.values(): +- cache_list = [cache_or_caches] if use_mla else cache_or_caches ++ # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs ++ # rank-5 (full attention, [K, V]). A single global use_mla flag ++ # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for ++ # hybrid models, so detect per cache. region_len is the actual tensor ++ # (or K/V half) byte size -- equivalent to num_blocks * block_len for ++ # homogeneous models, correct for heterogeneous ones. ++ cache_is_mla = cache_or_caches.dim() == 3 ++ cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches + for cache in cache_list: + base_addr = cache.data_ptr() +- region_len = self.num_blocks * self.block_len ++ region_len = cache.numel() * cache.element_size() + caches_data.append((base_addr, region_len, cache.device.index, "")) + kv_caches_base_addr.append(base_addr) + +@@ -1665,21 +1674,53 @@ class MoRIIOConnectorWorker: + Tuple of (local_offsets, remote_offsets, transfer_sizes) + """ + assert self.kv_cache_shape is not None, "KV caches shape not initialized" +- is_mla = len(self.kv_cache_shape) == 3 +- stride = self.kv_caches[layer_name].stride() +- sz = self.kv_caches[layer_name].element_size() +- if is_mla: +- blknum, blksize, hs = self.kv_cache_shape +- hn = 1 +- block_stride = stride[0] +- else: +- _, blknum, blksize, hn, hs = self.kv_cache_shape +- local_ktov_stride = stride[0] +- block_stride = stride[1] +- remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks ++ # Per-layer, axis-aware geometry. 
++ # ++ # The KV tensors vLLM hands the connector are laid out (verified on ++ # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1): ++ # * main attention (GQA, dense + sparse layers): ++ # shape (num_blocks, 2, block_size, num_kv_heads, head_dim) ++ # -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0. ++ # * sparse lightning-indexer (key-only, bf16): ++ # shape (num_blocks, block_size, head_dim) -- rank 3, no K/V axis. ++ # ++ # The legacy code assumed the FlashAttention-style [2, num_blocks, ...] ++ # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V ++ # stride) -- transposing block vs K/V so every block read the wrong ++ # region (corruption invisible to throughput-only benchmarks). Instead, ++ # detect the block axis (size == num_blocks) and the optional K/V axis ++ # (size 2) from THIS layer's own shape, and derive strides from them. The ++ # per-block stride is independent of num_blocks, so no remote-num_blocks ++ # scaling is needed (homogeneous P/D TP; heterogeneous TP still TODO). ++ layer_cache = self.kv_caches[layer_name] ++ layer_shape = tuple(layer_cache.shape) ++ stride = layer_cache.stride() ++ sz = layer_cache.element_size() ++ rank = len(layer_shape) ++ ++ # K/V axis = the size-2 axis among the two outermost dims (if any). ++ kv_axis: int | None = None ++ if rank >= 4: ++ if layer_shape[0] == 2: ++ kv_axis = 0 ++ elif layer_shape[1] == 2: ++ kv_axis = 1 ++ # Block axis = outermost non-K/V axis (the one indexed by block_id). ++ block_axis = 0 ++ if kv_axis == 0: ++ block_axis = 1 ++ block_stride = stride[block_axis] ++ kv_stride = stride[kv_axis] if kv_axis is not None else 0 ++ per_block = layer_shape[kv_axis] if kv_axis is not None else 1 # 2 (K,V) or 1 ++ ++ # One transferred slab = all dims except the block and K/V axes. 
++ slot_elems = 1 ++ for ax in range(rank): ++ if ax == block_axis or ax == kv_axis: ++ continue ++ slot_elems *= layer_shape[ax] ++ transfer_size_byte = slot_elems * sz + +- transfer_size_byte = blksize * hn * hs * sz +- per_block = 1 if is_mla else 2 + total = len(local_block_ids) * per_block + offset_local = [0] * total + offset_remote = [0] * total +@@ -1688,17 +1729,9 @@ class MoRIIOConnectorWorker: + w = 0 + for i, lb in enumerate(local_block_ids): + rb = remote_block_ids[i] +- # K +- offset_local[w] = sz * (lb * block_stride) +- offset_remote[w] = sz * (rb * block_stride) +- w += 1 +- if not is_mla: +- # V +- # Handle num_block variations originating from PD (different kv strides) +- # TODO: address block_sz differences in heterogeneous TP scenarios +- # In MLA, we don't need to consider these two cases. +- offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride) +- offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride) ++ for kv in range(per_block): ++ offset_local[w] = sz * (lb * block_stride + kv * kv_stride) ++ offset_remote[w] = sz * (rb * block_stride + kv * kv_stride) + w += 1 + + merged_l, merged_r, merged_s = self.merge_contiguous_blocks( +@@ -1722,15 +1755,26 @@ class MoRIIOConnectorWorker: + dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0) + sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id) + +- first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] +- offs = self._compute_block_transfer_offsets( +- first_layer, local_block_ids, remote_block_ids, remote_moriio_meta +- ) +- +- for layer_name in self.layer_name_to_local_kv_cache_metadata: +- sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( +- layer_name ++ # Heterogeneous-KV models register layers with different shapes/dtypes in ++ # a single KV-cache group sharing one block table, so block_ids match ++ # across layers but per-block byte geometry does not. 
Compute offsets per ++ # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3 ++ # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing. ++ layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys()) ++ offs_by_geom: dict = {} ++ for sess_idx, layer_name in enumerate(layer_names): ++ layer_cache = self.kv_caches[layer_name] ++ geom_key = ( ++ tuple(layer_cache.shape), ++ tuple(layer_cache.stride()), ++ layer_cache.dtype, + ) ++ offs = offs_by_geom.get(geom_key) ++ if offs is None: ++ offs = self._compute_block_transfer_offsets( ++ layer_name, local_block_ids, remote_block_ids, remote_moriio_meta ++ ) ++ offs_by_geom[geom_key] = offs + # TODO : apply multi-session batch-read when moriio support it + transfer_status = self.moriio_wrapper.read_remote_data( + offs[2], offs[0], offs[1], sessions[sess_idx] +diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +index 3ca5f37ca..113eccad0 100644 +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +@@ -279,21 +279,33 @@ class MoRIIOWriter: + Returns: + The transfer plan + """ +- # Compute offsets if not cached +- if request_info.transfer_offset is None: ++ # Compute offsets per distinct layer geometry. Heterogeneous-KV models ++ # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8 ++ # caches in one KV-cache group; caching a single offset set per request ++ # and reusing it for every layer corrupts the indexer cache. Block_ids ++ # are shared (single block table), so offsets depend only on the layer's ++ # shape/stride/dtype -- memoize by that geometry key. 
++ layer_cache = self.worker.kv_caches[task.layer_name] ++ geom_key = ( ++ tuple(layer_cache.shape), ++ tuple(layer_cache.stride()), ++ layer_cache.dtype, ++ ) ++ offsets = request_info.transfer_offsets.get(geom_key) ++ if offsets is None: + offsets = self.worker._compute_block_transfer_offsets( + task.layer_name, + task.local_block_ids, + request_info.block_ids, + remote_moriio_meta, + ) +- request_info.transfer_offset = offsets ++ request_info.transfer_offsets[geom_key] = offsets + + # Get session index + layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys()) + sess_idx = layer_names.index(task.layer_name) + +- local_off, remote_off, sizes = request_info.transfer_offset ++ local_off, remote_off, sizes = offsets + + return LayerTransferPlan( + request_id=task.request_id, From c1b19e26dde2b777b2271e0f2378af8e7fb64d5b Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Thu, 18 Jun 2026 18:22:44 -0400 Subject: [PATCH 11/20] disagg #1762: extend conc sweep to 32,64,128,256,512,1024 at 1k1k and 8k1k Widen the disagg sweep from conc 1,2,4,8,16 to 1,2,4,8,16,32,64,128,256,512,1024 for both seq-len scenarios (1P TP8 + 1D TP8). The 8k1k conc-16 point keeps the multi-node eval marked (eval-conc=16) so lm-eval still validates the MoRI-IO disagg pipeline. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ad879f894..d419ad73f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2751,7 +2751,7 @@ minimaxm3-fp8-mi355x-vllm-disagg: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16 ] + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] prefill: num-worker: 1 tp: 8 @@ -2766,15 +2766,15 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc 1,2,4,8,16. The - # conc-16 point also makes the multi-node eval policy (8k1k + conc >= 16) mark - # an lm-eval (eval-conc=16) — validating the M3 MoRI-IO disagg pipeline's - # correctness end-to-end. + # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc + # 1,2,4,8,16,32,64,128,256,512,1024. The conc-16 point also makes the + # multi-node eval policy (8k1k + conc >= 16) mark an lm-eval (eval-conc=16) — + # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end. 
- isl: 8192 osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16 ] + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ] prefill: num-worker: 1 tp: 8 From d0a7844fbe34f7a54e0658e154a487b2d460d371 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Thu, 18 Jun 2026 21:25:13 -0400 Subject: [PATCH 12/20] disagg #1762: add TP4-prefill P/D layouts (TP4+TP8, TP4+TP4) at 1k1k and 8k1k Add two asymmetric prefill/decode layouts alongside the existing TP8+TP8 sweep, for both seq-len scenarios: - 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1..256 - 1P TP4 + 1D TP4 (balanced half-node) at conc 64..1024 Per-worker TP is driven by the master-config prefill/decode tp: server_vllm.sh sed-rewrites the models_vllm.yaml --tensor-parallel-size 8 placeholder to the computed PREFILL_TP_SIZE/DECODE_TP_SIZE, so no models_vllm.yaml flag change is needed (comment updated to say so). The multinode eval policy still marks exactly one lm-eval (groups by dp-attn, not TP) on the TP8+TP8 8k1k layout. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 79 ++++++++++++++++++- .../multi_node/amd_utils/models_vllm.yaml | 4 +- 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d419ad73f..aa4887ad0 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2766,10 +2766,46 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - # 8k1k disagg sweep (same 1P TP8 + 1D TP8 layout) across conc - # 1,2,4,8,16,32,64,128,256,512,1024. The conc-16 point also makes the - # multi-node eval policy (8k1k + conc >= 16) mark an lm-eval (eval-conc=16) — - # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end. + # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across + # conc 1,2,4,8,16,32,64,128,256. 
+ - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. + - spec-decoding: "none" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 8k1k disagg sweep across three P/D layouts (1P TP8 + 1D TP8 conc 1..1024; + # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024). The multi-node + # eval policy (8k1k + conc >= 16) marks one lm-eval on the highest-max-conc + # layout (TP8+TP8, eval-conc=median=128) — validating the M3 MoRI-IO disagg + # pipeline's correctness end-to-end. - isl: 8192 osl: 1024 search-space: @@ -2789,3 +2825,38 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across + # conc 1,2,4,8,16,32,64,128,256. + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024. 
+ - spec-decoding: "none" + conc-list: [ 64, 128, 256, 512, 1024 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml index e78b6c647..a566fe449 100644 --- a/benchmarks/multi_node/amd_utils/models_vllm.yaml +++ b/benchmarks/multi_node/amd_utils/models_vllm.yaml @@ -44,7 +44,9 @@ gpt-oss-120b: env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0" MiniMax-M3-MXFP8: - # MiniMax-M3 MXFP8 disagg smoke test (TP8 prefill + TP8 decode, no EP). + # MiniMax-M3 MXFP8 disagg, no EP. The --tensor-parallel-size 8 below is just a + # placeholder: server_vllm.sh sed-rewrites it to PREFILL_TP_SIZE/DECODE_TP_SIZE + # from the master-config prefill/decode tp (the sweep mixes TP8 and TP4 layouts). # --block-size 128 is mandatory (MSA sparse/index cache); text-only benchmark # so --language-model-only frees the vision encoder. gfx950 uses FP8 KV cache. prefill_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice" From 5c06ea75b8e9cb70135378e9e5ce318c9b6c7847 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Fri, 19 Jun 2026 11:36:16 +0000 Subject: [PATCH 13/20] feat(amd-disagg): bundle heterogeneous-TP + dup-ack fixes into unified MoRIIO diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces moriio-kv-layout-fix.diff with moriio-minimax-m3-disagg.diff, which bundles three layered fixes for the stock minimax-m3 vLLM image: 1. 
KV-layout: axis-aware per-layer block offsets (the gsm8k 0.0008→0.958 fix, required for homogeneous TP too). 2. heterogeneous-TP addressing + guard: maps each decode rank to the correct prefill rank (tp_rank // ratio) for PREFILL_TP_SIZE != DECODE_TP_SIZE, and raises NotImplementedError for unsupported cases (prefill-TP > decode-TP, KV-head splitting) instead of silently corrupting KV. 3. dup-ack fan-in: with DECODE_TP_SIZE > PREFILL_TP_SIZE, producer counts ACKs per transfer_id and only frees KV blocks once all expected consumers ACK, preventing both the late-ACK EngineCore crash and KV reuse before slower decode ranks finish reading. job.slurm and patches/README.md updated to reference the new diff name. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/multi_node/amd_utils/job.slurm | 27 +- .../multi_node/amd_utils/patches/README.md | 37 +- .../patches/moriio/moriio-kv-layout-fix.diff | 218 -------- .../moriio/moriio-minimax-m3-disagg.diff | 479 ++++++++++++++++++ 4 files changed, 535 insertions(+), 226 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff create mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 727f64632..1a546b361 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -97,14 +97,35 @@ fi # diff needs no extra mount -- just an in-container `patch` call. A failed # apply aborts the container: silently running unpatched would silently # corrupt accuracy, not just skip a feature. -_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff" +# +# The single diff bundles three layered fixes (all in patches/moriio/, +# see patches/README.md): +# 1. KV-layout: the load-bearing accuracy fix (axis-aware per-layer offsets; +# gsm8k 0.0008 -> 0.958). 
Required for homogeneous TP too. +# 2. heterogeneous-TP (no-op for homogeneous TP, required for +# PREFILL_TP_SIZE != DECODE_TP_SIZE -- see .github/configs/amd-master.yaml's TP4+TP8 +# configs): handshake/notify port addressing maps each decode rank to the +# correct prefill rank instead of its own raw tp_rank (stock +# MoRIIOConnector has no fan-out concept at all), and guards (fail loud) +# the KV-head-split / prefill-TP>decode-TP cases MoRIIO can't serve. +# 3. dup-ack: with DECODE_TP_SIZE > PREFILL_TP_SIZE, N decode ranks fan in to +# 1 prefill rank and each sends its own completion ack for the same +# transfer_id. Freeing KV blocks on the first ack (the original +# MoRIIOConnector behavior) both crashes EngineCore on the late second ack +# (AssertionError in Scheduler._update_from_kv_xfer_finished) and risks +# silently corrupting KV if the slower decode rank's read is still in +# flight when the blocks are reused. Fix mirrors NIXL's +# consumer_notification_counts_by_req: producer counts acks per +# transfer_id (consumer embeds its own tp_size in the notify message) and +# only frees once all expected consumers have acked.
+_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff" _MORIIO_PATCH_CMD="" if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \ && [[ -f "$_MORIIO_DIFF" ]] \ && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \ && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then - _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff || exit 1" - echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout diff inside container: ${_MORIIO_DIFF}" + _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff || exit 1" + echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout + heterogeneous-TP + dup-ack diff inside container: ${_MORIIO_DIFF}" fi xP="${xP:-1}" diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md index 27f9fc81d..a75f38854 100644 --- a/benchmarks/multi_node/amd_utils/patches/README.md +++ b/benchmarks/multi_node/amd_utils/patches/README.md @@ -84,8 +84,9 @@ container behavior is byte-identical to the unpatched path. ## `moriio/` (vLLM MoRIIO connector, MiniMax-M3) -A unified diff (`moriio-kv-layout-fix.diff`), applied with `patch -p1` -against the vLLM package dir inside the container, touching three files: +A single unified diff (`moriio-minimax-m3-disagg.diff`), applied with +`patch -p1` against the vLLM package dir inside the container, touching +three files: ``` /usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/ @@ -117,10 +118,36 @@ indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets per geometry. Result: disagg gsm8k `strict-match 0.9583 / flexible-extract 0.9575` (matches single-node). Homogeneous models (uniform layout) are unaffected — one geometry, one offset set, same -result. 
Heterogeneous-TP P/D (prefill TP ≠ decode TP) is still a TODO -(same as upstream). Full write-up in +result. Full write-up in `/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`. +The diff also bundles two heterogeneous-TP layers (no-op for homogeneous +TP, exercised by `.github/configs/amd-master.yaml`'s TP4-prefill + TP8-decode +configs): + +- **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always + addresses remote rank == local `tp_rank`, which has no listener once + `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode + rank to the prefill rank holding its KV head (`tp_rank // ratio`, + mirroring NIXL's `TpKVTopology.get_target_remote_ranks`). This is + byte-correct only when KV heads are **replicated** (`tp_size >= + total_kv_heads`, i.e. ≤1 distinct head per rank — MiniMax-M3 has 4 KV + heads, so any TP≥4 is replicated). The cases MoRIIO can't serve — + prefill TP > decode TP (needs multi-rank fan-in) and KV-head splitting + (`total_kv_heads > prefill_tp`, which would need per-head slicing of the + NHD layout, unrepresentable as one `(offset,len)` per block) — now + **raise `NotImplementedError`** in `_compute_block_transfer_offsets` + instead of silently transferring corrupt KV. (NIXL likewise only splits + heads in HND layout and raises otherwise.) +- **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode + ranks read from one prefill rank and each ACKs the same `transfer_id`. + The producer now counts ACKs per `transfer_id` (consumer embeds its own + `tp_size` in the notify payload) and only reports `finished_sending` + once all expected consumers have ACKed — preventing both the late-ACK + `EngineCore` crash and freeing/reusing KV blocks while a slower decode + rank is still reading. Mirrors NIXL's + `consumer_notification_counts_by_req`. + ### How to enable `job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains @@ -131,7 +158,7 @@ caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g.
the ```bash patch -p1 -d /usr/local/lib/python3.12/dist-packages \ - < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff + < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff ``` (`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff deleted file mode 100644 index 7f6c435bf..000000000 --- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-kv-layout-fix.diff +++ /dev/null @@ -1,218 +0,0 @@ -diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py -index 73694ce32..a30d30af8 100644 ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py -@@ -80,6 +80,10 @@ class RemoteAllocInfo: - writes_done: int = 0 - decode_dp_rank: int = 0 - transfer_offset: tuple[list[int], list[int], list[int]] | None = None -+ # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for -+ # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a -+ # single entry. See MoRIIOWriter._prepare_transfer_plan. -+ transfer_offsets: dict = field(default_factory=dict) - - - class ROLE(Enum): -diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py -index 167eef6e1..1846a3c21 100644 ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py -@@ -1233,8 +1233,10 @@ class MoRIIOConnectorWorker: - block_size, kv_latent_dim = block_shape - self.slot_size_bytes = kv_elem_size * kv_latent_dim - else: -- # [2 (k and v), num_blocks, ...] 
-- self.num_blocks = first_kv_cache.shape[1] -+ # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V -+ # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read -+ # shape[1] here, which is the size-2 K/V axis, not num_blocks.) -+ self.num_blocks = first_kv_cache.shape[0] - block_rank = 3 # [block_size, kv_heads, head_dim] - block_shape = first_kv_cache.shape[-block_rank:] - block_size, n_kv_heads, head_dim = block_shape[-3:] -@@ -1257,10 +1259,17 @@ class MoRIIOConnectorWorker: - caches_data = [] - - for cache_or_caches in kv_caches.values(): -- cache_list = [cache_or_caches] if use_mla else cache_or_caches -+ # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs -+ # rank-5 (full attention, [K, V]). A single global use_mla flag -+ # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for -+ # hybrid models, so detect per cache. region_len is the actual tensor -+ # (or K/V half) byte size -- equivalent to num_blocks * block_len for -+ # homogeneous models, correct for heterogeneous ones. 
-+ cache_is_mla = cache_or_caches.dim() == 3 -+ cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches - for cache in cache_list: - base_addr = cache.data_ptr() -- region_len = self.num_blocks * self.block_len -+ region_len = cache.numel() * cache.element_size() - caches_data.append((base_addr, region_len, cache.device.index, "")) - kv_caches_base_addr.append(base_addr) - -@@ -1665,21 +1674,53 @@ class MoRIIOConnectorWorker: - Tuple of (local_offsets, remote_offsets, transfer_sizes) - """ - assert self.kv_cache_shape is not None, "KV caches shape not initialized" -- is_mla = len(self.kv_cache_shape) == 3 -- stride = self.kv_caches[layer_name].stride() -- sz = self.kv_caches[layer_name].element_size() -- if is_mla: -- blknum, blksize, hs = self.kv_cache_shape -- hn = 1 -- block_stride = stride[0] -- else: -- _, blknum, blksize, hn, hs = self.kv_cache_shape -- local_ktov_stride = stride[0] -- block_stride = stride[1] -- remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks -+ # Per-layer, axis-aware geometry. -+ # -+ # The KV tensors vLLM hands the connector are laid out (verified on -+ # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1): -+ # * main attention (GQA, dense + sparse layers): -+ # shape (num_blocks, 2, block_size, num_kv_heads, head_dim) -+ # -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0. -+ # * sparse lightning-indexer (key-only, bf16): -+ # shape (num_blocks, block_size, head_dim) -- rank 3, no K/V axis. -+ # -+ # The legacy code assumed the FlashAttention-style [2, num_blocks, ...] -+ # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V -+ # stride) -- transposing block vs K/V so every block read the wrong -+ # region (corruption invisible to throughput-only benchmarks). Instead, -+ # detect the block axis (size == num_blocks) and the optional K/V axis -+ # (size 2) from THIS layer's own shape, and derive strides from them. 
The -+ # per-block stride is independent of num_blocks, so no remote-num_blocks -+ # scaling is needed (homogeneous P/D TP; heterogeneous TP still TODO). -+ layer_cache = self.kv_caches[layer_name] -+ layer_shape = tuple(layer_cache.shape) -+ stride = layer_cache.stride() -+ sz = layer_cache.element_size() -+ rank = len(layer_shape) -+ -+ # K/V axis = the size-2 axis among the two outermost dims (if any). -+ kv_axis: int | None = None -+ if rank >= 4: -+ if layer_shape[0] == 2: -+ kv_axis = 0 -+ elif layer_shape[1] == 2: -+ kv_axis = 1 -+ # Block axis = outermost non-K/V axis (the one indexed by block_id). -+ block_axis = 0 -+ if kv_axis == 0: -+ block_axis = 1 -+ block_stride = stride[block_axis] -+ kv_stride = stride[kv_axis] if kv_axis is not None else 0 -+ per_block = layer_shape[kv_axis] if kv_axis is not None else 1 # 2 (K,V) or 1 -+ -+ # One transferred slab = all dims except the block and K/V axes. -+ slot_elems = 1 -+ for ax in range(rank): -+ if ax == block_axis or ax == kv_axis: -+ continue -+ slot_elems *= layer_shape[ax] -+ transfer_size_byte = slot_elems * sz - -- transfer_size_byte = blksize * hn * hs * sz -- per_block = 1 if is_mla else 2 - total = len(local_block_ids) * per_block - offset_local = [0] * total - offset_remote = [0] * total -@@ -1688,17 +1729,9 @@ class MoRIIOConnectorWorker: - w = 0 - for i, lb in enumerate(local_block_ids): - rb = remote_block_ids[i] -- # K -- offset_local[w] = sz * (lb * block_stride) -- offset_remote[w] = sz * (rb * block_stride) -- w += 1 -- if not is_mla: -- # V -- # Handle num_block variations originating from PD (different kv strides) -- # TODO: address block_sz differences in heterogeneous TP scenarios -- # In MLA, we don't need to consider these two cases. 
-- offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride) -- offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride) -+ for kv in range(per_block): -+ offset_local[w] = sz * (lb * block_stride + kv * kv_stride) -+ offset_remote[w] = sz * (rb * block_stride + kv * kv_stride) - w += 1 - - merged_l, merged_r, merged_s = self.merge_contiguous_blocks( -@@ -1722,15 +1755,26 @@ class MoRIIOConnectorWorker: - dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0) - sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id) - -- first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] -- offs = self._compute_block_transfer_offsets( -- first_layer, local_block_ids, remote_block_ids, remote_moriio_meta -- ) -- -- for layer_name in self.layer_name_to_local_kv_cache_metadata: -- sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( -- layer_name -+ # Heterogeneous-KV models register layers with different shapes/dtypes in -+ # a single KV-cache group sharing one block table, so block_ids match -+ # across layers but per-block byte geometry does not. Compute offsets per -+ # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3 -+ # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing. 
-+ layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys()) -+ offs_by_geom: dict = {} -+ for sess_idx, layer_name in enumerate(layer_names): -+ layer_cache = self.kv_caches[layer_name] -+ geom_key = ( -+ tuple(layer_cache.shape), -+ tuple(layer_cache.stride()), -+ layer_cache.dtype, - ) -+ offs = offs_by_geom.get(geom_key) -+ if offs is None: -+ offs = self._compute_block_transfer_offsets( -+ layer_name, local_block_ids, remote_block_ids, remote_moriio_meta -+ ) -+ offs_by_geom[geom_key] = offs - # TODO : apply multi-session batch-read when moriio support it - transfer_status = self.moriio_wrapper.read_remote_data( - offs[2], offs[0], offs[1], sessions[sess_idx] -diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py -index 3ca5f37ca..113eccad0 100644 ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py -@@ -279,21 +279,33 @@ class MoRIIOWriter: - Returns: - The transfer plan - """ -- # Compute offsets if not cached -- if request_info.transfer_offset is None: -+ # Compute offsets per distinct layer geometry. Heterogeneous-KV models -+ # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8 -+ # caches in one KV-cache group; caching a single offset set per request -+ # and reusing it for every layer corrupts the indexer cache. Block_ids -+ # are shared (single block table), so offsets depend only on the layer's -+ # shape/stride/dtype -- memoize by that geometry key. 
-+ layer_cache = self.worker.kv_caches[task.layer_name] -+ geom_key = ( -+ tuple(layer_cache.shape), -+ tuple(layer_cache.stride()), -+ layer_cache.dtype, -+ ) -+ offsets = request_info.transfer_offsets.get(geom_key) -+ if offsets is None: - offsets = self.worker._compute_block_transfer_offsets( - task.layer_name, - task.local_block_ids, - request_info.block_ids, - remote_moriio_meta, - ) -- request_info.transfer_offset = offsets -+ request_info.transfer_offsets[geom_key] = offsets - - # Get session index - layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys()) - sess_idx = layer_names.index(task.layer_name) - -- local_off, remote_off, sizes = request_info.transfer_offset -+ local_off, remote_off, sizes = offsets - - return LayerTransferPlan( - request_id=task.request_id, diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff new file mode 100644 index 000000000..700cf26c3 --- /dev/null +++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff @@ -0,0 +1,479 @@ +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py +@@ -80,6 +80,10 @@ + writes_done: int = 0 + decode_dp_rank: int = 0 + transfer_offset: tuple[list[int], list[int], list[int]] | None = None ++ # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for ++ # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a ++ # single entry. See MoRIIOWriter._prepare_transfer_plan. ++ transfer_offsets: dict = field(default_factory=dict) + + + class ROLE(Enum): +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +@@ -740,6 +740,21 @@ + # Completions that arrived before transfer_id_to_request_id was populated. 
+ # Retried each step until the mapping is established. + self._unmatched_write_completions: set[str] = set() ++ # Producer side: with heterogeneous-TP P/D (DECODE_TP_SIZE > ++ # PREFILL_TP_SIZE), more than one decode rank reads this rank's KV ++ # slice and each sends its own completion notify for the SAME ++ # transfer_id once its own read finishes. We must not report ++ # finished_sending (which lets the core scheduler free/reuse the ++ # blocks) until *all* of them have acked -- otherwise a still-in- ++ # flight slower reader can read corrupted/reused memory. This counts ++ # per-transfer_id notifies against the expected fan-in count (derived ++ # from the consumer's own tp_size, sent alongside the transfer_id -- ++ # see send_notify call sites) and only resolves once it's complete. ++ # Mirrors NIXL's consumer_notification_counts_by_req. Pruned in ++ # start_load_kv() once a transfer_id drops out of the live mapping ++ # (e.g. force-freed by the scheduler's defer_timeout without ever ++ # reaching full count). ++ self._consumer_notification_counts: dict[str, int] = {} + + role = "producer" if self.is_producer else "consumer" + engine_suffix = ( +@@ -1085,6 +1100,30 @@ + req_id.decode(), + ) + ++ def _remote_tp_rank(self, remote_tp_size: int) -> int: ++ """Map this worker's local tp_rank to the remote tp_rank it must ++ address when local and remote TP sizes differ (heterogeneous-TP P/D, ++ e.g. PREFILL_TP_SIZE=4 / DECODE_TP_SIZE=8). ++ ++ vLLM replicates KV heads across TP ranks in groups of size ++ local_tp_size/remote_tp_size (see ModelConfig.get_num_kv_heads: ++ max(1, total_kv_heads // tp_size)), so every local rank within a ++ replica group must address the SAME single remote rank -- ++ floor(local_tp_rank / ratio) -- instead of its own raw tp_rank, which ++ has no listener once local tp_size > remote tp_size. Mirrors vLLM's ++ NIXL connector (TpKVTopology.get_target_remote_ranks). The reverse ++ case (remote tp_size > local tp_size, e.g. 
P-TP > D-TP) would need ++ multi-rank fan-in reads and is not handled here. ++ """ ++ if remote_tp_size == self.world_size: ++ return self.tp_rank ++ assert self.world_size % remote_tp_size == 0, ( ++ f"local tp_size {self.world_size} must be a multiple of remote " ++ f"tp_size {remote_tp_size} for heterogeneous-TP P/D (remote " ++ "tp_size > local tp_size is not supported)" ++ ) ++ return self.tp_rank // (self.world_size // remote_tp_size) ++ + def _moriio_handshake( + self, + host: str, +@@ -1101,7 +1140,9 @@ + # a hack to keep us moving. We will switch when moving to etcd + # or where we have a single ZMQ socket in the scheduler. + +- port_offset = get_port_offset(remote_dp_rank, self.tp_rank) ++ port_offset = get_port_offset( ++ remote_dp_rank, self._remote_tp_rank(remote_tp_size) ++ ) + path = make_zmq_path("tcp", host, port + port_offset) + logger.debug("handshake Querying metadata on path: %s", path) + +@@ -1233,8 +1274,10 @@ + block_size, kv_latent_dim = block_shape + self.slot_size_bytes = kv_elem_size * kv_latent_dim + else: +- # [2 (k and v), num_blocks, ...] +- self.num_blocks = first_kv_cache.shape[1] ++ # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V ++ # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read ++ # shape[1] here, which is the size-2 K/V axis, not num_blocks.) ++ self.num_blocks = first_kv_cache.shape[0] + block_rank = 3 # [block_size, kv_heads, head_dim] + block_shape = first_kv_cache.shape[-block_rank:] + block_size, n_kv_heads, head_dim = block_shape[-3:] +@@ -1257,10 +1300,17 @@ + caches_data = [] + + for cache_or_caches in kv_caches.values(): +- cache_list = [cache_or_caches] if use_mla else cache_or_caches ++ # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs ++ # rank-5 (full attention, [K, V]). A single global use_mla flag ++ # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for ++ # hybrid models, so detect per cache. 
region_len is the actual tensor ++ # (or K/V half) byte size -- equivalent to num_blocks * block_len for ++ # homogeneous models, correct for heterogeneous ones. ++ cache_is_mla = cache_or_caches.dim() == 3 ++ cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches + for cache in cache_list: + base_addr = cache.data_ptr() +- region_len = self.num_blocks * self.block_len ++ region_len = cache.numel() * cache.element_size() + caches_data.append((base_addr, region_len, cache.device.index, "")) + kv_caches_base_addr.append(base_addr) + +@@ -1338,13 +1388,45 @@ + done_sending, done_recving = set(), set() + + if self.is_producer: +- # pop_finished_req_ids returns transfer_ids (the ZMQ payload sent +- # by decode via send_notify); map back to req_ids for the scheduler. +- finished_transfer_ids = self.moriio_wrapper.pop_finished_req_ids() ++ # pop_finished_req_ids returns every completion message received ++ # since the last call (NOT deduped -- with heterogeneous-TP ++ # fan-out, two different decode ranks legitimately send ++ # byte-identical messages for the same transfer_id and each one ++ # must be counted). Payload is "<transfer_id>:<consumer_tp_size>" ++ # (see send_notify call sites); plain transfer_id with no ":" is ++ # treated as a 1:1 ack (internal WRITE-mode completions). ++ finished_transfer_msgs = self.moriio_wrapper.pop_finished_req_ids() ++ resolved_transfer_ids: set[str] = set() ++ for raw_msg in finished_transfer_msgs: ++ xfer_id, _, tp_size_str = raw_msg.rpartition(":") ++ if not xfer_id: ++ xfer_id, tp_size_str = raw_msg, str(self.world_size) ++ if xfer_id not in self.transfer_id_to_request_id: ++ logger.warning( ++ "Could not find %s in transfer_id_to_request_id " ++ "lookup table.
This could lead to a possible hang.", ++ xfer_id, ++ ) ++ continue ++ consumer_tp_size = int(tp_size_str) ++ if consumer_tp_size > self.world_size: ++ assert consumer_tp_size % self.world_size == 0, ( ++ f"consumer tp_size {consumer_tp_size} must be a " ++ f"multiple of producer tp_size {self.world_size} " ++ "for heterogeneous-TP P/D" ++ ) ++ expected_acks = consumer_tp_size // self.world_size ++ else: ++ expected_acks = 1 ++ count = self._consumer_notification_counts.get(xfer_id, 0) + 1 ++ if count >= expected_acks: ++ self._consumer_notification_counts.pop(xfer_id, None) ++ resolved_transfer_ids.add(xfer_id) ++ else: ++ self._consumer_notification_counts[xfer_id] = count + done_sending = { + self.transfer_id_to_request_id[xfer_id] +- for xfer_id in finished_transfer_ids +- if xfer_id in self.transfer_id_to_request_id ++ for xfer_id in resolved_transfer_ids + } + else: + if self.mode == MoRIIOMode.WRITE: +@@ -1389,7 +1471,13 @@ + if last.Succeeded(): + host, port, xfer_id = self._recving_transfers_callback_addr[req_id] + done_req_ids.add(xfer_id) +- self.moriio_wrapper.send_notify(xfer_id, host, port) ++ # Embed our own tp_size so the producer can tell, with ++ # heterogeneous-TP fan-out, how many consumer acks to ++ # expect for this transfer_id before it's safe to free ++ # the blocks (see _consumer_notification_counts). 
++ self.moriio_wrapper.send_notify( ++ f"{xfer_id}:{self.world_size}", host, port ++ ) + to_remove.append(req_id) + elif last.Failed(): + logger.error( +@@ -1402,7 +1490,9 @@ + ) + host, port, xfer_id = self._recving_transfers_callback_addr[req_id] + try: +- self.moriio_wrapper.send_notify(xfer_id, host, port) ++ self.moriio_wrapper.send_notify( ++ f"{xfer_id}:{self.world_size}", host, port ++ ) + except Exception: + logger.exception( + "Failed to send error notification for request %s", +@@ -1488,6 +1578,15 @@ + """ + self.transfer_id_to_request_id = metadata.transfer_id_to_request_id + if self.is_producer: ++ # Drop counts for transfer_ids that dropped out of the live ++ # mapping without ever reaching full ack count (e.g. force-freed ++ # by the scheduler's defer_timeout) -- they can never resolve via ++ # get_finished() anymore, so stop tracking them to bound memory. ++ self._consumer_notification_counts = { ++ xfer_id: count ++ for xfer_id, count in self._consumer_notification_counts.items() ++ if xfer_id in self.transfer_id_to_request_id ++ } + self.moriio_wrapper.async_wait_reqid() + return + if self.mode == MoRIIOMode.WRITE: +@@ -1560,6 +1659,7 @@ + remote_block_ids=meta.remote_block_ids, + remote_host=meta.remote_host, + remote_notify_port=meta.remote_notify_port, ++ remote_tp_size=meta.tp_size, + ) + + def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer): +@@ -1653,6 +1753,7 @@ + local_block_ids: list[int], + remote_block_ids: list[int], + remote_moriio_meta: MoRIIOAgentMetadata, ++ remote_tp_size: int | None = None, + ) -> tuple[list[int], list[int], list[int]]: + """Compute transfer offsets for block data. + +@@ -1661,25 +1762,110 @@ + local_block_ids: IDs of local blocks + remote_block_ids: IDs of remote blocks + remote_moriio_meta: Metadata of the remote MoRIIO agent ++ remote_tp_size: tp_size of the remote (producer/prefill) instance. ++ Defaults to this worker's world_size (homogeneous P/D TP). 
When ++ it differs, used to validate that KV heads are replicated (the ++ only heterogeneous-TP regime MoRIIO supports) -- see the guard ++ below. + Returns: + Tuple of (local_offsets, remote_offsets, transfer_sizes) + """ + assert self.kv_cache_shape is not None, "KV caches shape not initialized" +- is_mla = len(self.kv_cache_shape) == 3 +- stride = self.kv_caches[layer_name].stride() +- sz = self.kv_caches[layer_name].element_size() +- if is_mla: +- blknum, blksize, hs = self.kv_cache_shape +- hn = 1 +- block_stride = stride[0] +- else: +- _, blknum, blksize, hn, hs = self.kv_cache_shape +- local_ktov_stride = stride[0] +- block_stride = stride[1] +- remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks ++ # Per-layer, axis-aware geometry. ++ # ++ # The KV tensors vLLM hands the connector are laid out (verified on ++ # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1): ++ # * main attention (GQA, dense + sparse layers): ++ # shape (num_blocks, 2, block_size, num_kv_heads, head_dim) ++ # -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0. ++ # * sparse lightning-indexer (key-only, bf16): ++ # shape (num_blocks, block_size, head_dim) -- rank 3, no K/V axis. ++ # ++ # The legacy code assumed the FlashAttention-style [2, num_blocks, ...] ++ # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V ++ # stride) -- transposing block vs K/V so every block read the wrong ++ # region (corruption invisible to throughput-only benchmarks). Instead, ++ # detect the block axis (size == num_blocks) and the optional K/V axis ++ # (size 2) from THIS layer's own shape, and derive strides from them. The ++ # per-block stride is independent of num_blocks, so no remote-num_blocks ++ # scaling is needed. ++ layer_cache = self.kv_caches[layer_name] ++ layer_shape = tuple(layer_cache.shape) ++ stride = layer_cache.stride() ++ sz = layer_cache.element_size() ++ rank = len(layer_shape) ++ ++ # K/V axis = the size-2 axis among the two outermost dims (if any). 
++ kv_axis: int | None = None ++ if rank >= 4: ++ if layer_shape[0] == 2: ++ kv_axis = 0 ++ elif layer_shape[1] == 2: ++ kv_axis = 1 ++ # Block axis = outermost non-K/V axis (the one indexed by block_id). ++ block_axis = 0 ++ if kv_axis == 0: ++ block_axis = 1 ++ block_stride = stride[block_axis] ++ kv_stride = stride[kv_axis] if kv_axis is not None else 0 ++ per_block = layer_shape[kv_axis] if kv_axis is not None else 1 # 2 (K,V) or 1 ++ ++ # One transferred slab = all dims except the block and K/V axes. ++ slot_elems = 1 ++ for ax in range(rank): ++ if ax == block_axis or ax == kv_axis: ++ continue ++ slot_elems *= layer_shape[ax] ++ ++ # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) ----------- ++ # When local (decode) and remote (prefill) tp_size differ, _remote_tp_rank ++ # maps each local rank to the single remote rank it reads from. That ++ # whole-block read is byte-correct *only* when KV heads are REPLICATED, ++ # i.e. the remote rank holds exactly the head(s) this rank owns and no ++ # more. vLLM replicates KV heads whenever tp_size >= total_kv_heads ++ # (ModelConfig.get_num_kv_heads -> max(1, total//tp)); with the ++ # r // tp_ratio rank mapping, head ownership then lines up exactly, so ++ # no head offset is needed. This is MiniMax-M3's regime (4 KV heads, ++ # TP>=4) and is the only heterogeneous-TP case MoRIIO supports. ++ # ++ # If instead a remote rank packs MORE distinct KV heads than this local ++ # rank owns (total_kv_heads > remote_tp, i.e. heads are SPLIT on the ++ # producer), each fan-in rank would have to read only its head slice of ++ # the remote block. MoRIIO's per-block tensors are NHD ++ # ([block_size, kv_heads, head_dim] -- heads interleaved per token), so ++ # a head slice is NOT a contiguous sub-region and cannot be expressed as ++ # a single (offset, len) per block. NIXL only supports head splitting in ++ # HND layout and raises otherwise; we do the same -- fail loud rather ++ # than silently corrupt KV. 
MLA / rank-3 indexer caches are always ++ # replicated (no K/V axis) and never hit this path. ++ local_tp = self.world_size ++ remote_tp = remote_tp_size if remote_tp_size is not None else local_tp ++ if remote_tp != local_tp and not self.use_mla and kv_axis is not None: ++ total_kv_heads = self.model_config.get_total_num_kv_heads() ++ if remote_tp > local_tp: ++ # Prefill TP > decode TP: this rank would need to fan IN reads ++ # from multiple remote ranks (NIXL's negative-tp_ratio path). ++ raise NotImplementedError( ++ f"Heterogeneous-TP with remote (prefill) tp_size {remote_tp} " ++ f"> local (decode) tp_size {local_tp} requires multi-rank " ++ "fan-in reads, not supported by MoRIIOConnector." ++ ) ++ remote_heads = max(1, total_kv_heads // remote_tp) ++ local_heads = max(1, total_kv_heads // local_tp) ++ if remote_heads > local_heads: ++ # KV heads are split (not replicated) on the producer -> would ++ # need NHD head slicing, which MoRIIO can't express per block. ++ raise NotImplementedError( ++ f"Heterogeneous-TP head splitting (total_kv_heads " ++ f"{total_kv_heads} > prefill tp_size {remote_tp}: " ++ f"{remote_heads} heads/rank on prefill vs {local_heads} on " ++ "decode) requires per-head slicing of an NHD KV layout, not " ++ "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= " ++ "total_kv_heads so KV heads are replicated." 
++ ) ++ ++ transfer_size_byte = slot_elems * sz + +- transfer_size_byte = blksize * hn * hs * sz +- per_block = 1 if is_mla else 2 + total = len(local_block_ids) * per_block + offset_local = [0] * total + offset_remote = [0] * total +@@ -1688,17 +1874,9 @@ + w = 0 + for i, lb in enumerate(local_block_ids): + rb = remote_block_ids[i] +- # K +- offset_local[w] = sz * (lb * block_stride) +- offset_remote[w] = sz * (rb * block_stride) +- w += 1 +- if not is_mla: +- # V +- # Handle num_block variations originating from PD (different kv strides) +- # TODO: address block_sz differences in heterogeneous TP scenarios +- # In MLA, we don't need to consider these two cases. +- offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride) +- offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride) ++ for kv in range(per_block): ++ offset_local[w] = sz * (lb * block_stride + kv * kv_stride) ++ offset_remote[w] = sz * (rb * block_stride + kv * kv_stride) + w += 1 + + merged_l, merged_r, merged_s = self.merge_contiguous_blocks( +@@ -1715,6 +1893,7 @@ + transfer_id: str, + remote_host: str, + remote_notify_port: int, ++ remote_tp_size: int, + ) -> None: + if self.mode == MoRIIOMode.WRITE: + return +@@ -1722,15 +1901,30 @@ + dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0) + sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id) + +- first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] +- offs = self._compute_block_transfer_offsets( +- first_layer, local_block_ids, remote_block_ids, remote_moriio_meta +- ) +- +- for layer_name in self.layer_name_to_local_kv_cache_metadata: +- sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( +- layer_name ++ # Heterogeneous-KV models register layers with different shapes/dtypes in ++ # a single KV-cache group sharing one block table, so block_ids match ++ # across layers but per-block byte geometry does not. 
Compute offsets per ++ # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3 ++ # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing. ++ layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys()) ++ offs_by_geom: dict = {} ++ for sess_idx, layer_name in enumerate(layer_names): ++ layer_cache = self.kv_caches[layer_name] ++ geom_key = ( ++ tuple(layer_cache.shape), ++ tuple(layer_cache.stride()), ++ layer_cache.dtype, + ) ++ offs = offs_by_geom.get(geom_key) ++ if offs is None: ++ offs = self._compute_block_transfer_offsets( ++ layer_name, ++ local_block_ids, ++ remote_block_ids, ++ remote_moriio_meta, ++ remote_tp_size=remote_tp_size, ++ ) ++ offs_by_geom[geom_key] = offs + # TODO : apply multi-session batch-read when moriio support it + transfer_status = self.moriio_wrapper.read_remote_data( + offs[2], offs[0], offs[1], sessions[sess_idx] +@@ -1739,6 +1933,6 @@ + self._recving_transfers[request_id].append(transfer_status) + self._recving_transfers_callback_addr[request_id] = ( + remote_host, +- str(remote_notify_port + self.tp_rank), ++ str(remote_notify_port + self._remote_tp_rank(remote_tp_size)), + transfer_id, + ) +--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py ++++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py +@@ -279,21 +279,33 @@ + Returns: + The transfer plan + """ +- # Compute offsets if not cached +- if request_info.transfer_offset is None: ++ # Compute offsets per distinct layer geometry. Heterogeneous-KV models ++ # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8 ++ # caches in one KV-cache group; caching a single offset set per request ++ # and reusing it for every layer corrupts the indexer cache. Block_ids ++ # are shared (single block table), so offsets depend only on the layer's ++ # shape/stride/dtype -- memoize by that geometry key. 
++ layer_cache = self.worker.kv_caches[task.layer_name] ++ geom_key = ( ++ tuple(layer_cache.shape), ++ tuple(layer_cache.stride()), ++ layer_cache.dtype, ++ ) ++ offsets = request_info.transfer_offsets.get(geom_key) ++ if offsets is None: + offsets = self.worker._compute_block_transfer_offsets( + task.layer_name, + task.local_block_ids, + request_info.block_ids, + remote_moriio_meta, + ) +- request_info.transfer_offset = offsets ++ request_info.transfer_offsets[geom_key] = offsets + + # Get session index + layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys()) + sess_idx = layer_names.index(task.layer_name) + +- local_off, remote_off, sizes = request_info.transfer_offset ++ local_off, remote_off, sizes = offsets + + return LayerTransferPlan( + request_id=task.request_id, +@@ -671,9 +683,14 @@ + raise + + def pop_finished_req_ids(self): +- # producer invocation: get the set of completed requests at the decode ++ # Producer invocation: get all completion messages received since the ++ # last call. Returned as a list, NOT deduped -- with heterogeneous-TP ++ # fan-out, two different decode ranks can send byte-identical ++ # messages for the same transfer_id, and the caller (get_finished()) ++ # needs to count every individual occurrence to know when all ++ # expected consumers have acked. + with self.lock: +- done_send = set(self.done_req_ids) ++ done_send = list(self.done_req_ids) + self.done_req_ids = [] + return done_send + From 79d137de418b31f7c6a2ee8268e54cb32e02fb6b Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Fri, 19 Jun 2026 12:47:51 +0000 Subject: [PATCH 14/20] fix(moriio): correct _remote_tp_rank for prefill-TP > decode-TP (P8/D4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With P8/D4 and 4 KV heads, vLLM distributes heads across prefill ranks in consecutive pairs: (rank0,rank1)→head0, (rank2,rank3)→head1, etc. 
The previous patch used `return self.tp_rank` for the P>D branch, which made decode rank 1 connect to prefill rank 1 (holds head0) instead of prefill rank 2 (holds head1) — corrupting KV for all decode ranks except 0. Fix: use `self.tp_rank * ratio` (ratio = remote_tp_size // local_tp_size), the symmetric counterpart to the D>P case's `tp_rank // ratio`. This maps each decode rank to the *first* prefill rank of its head group, which holds the correct KV content via vLLM's replication scheme. Co-Authored-By: Claude Sonnet 4.6 --- .../multi_node/amd_utils/patches/README.md | 29 +++-- .../moriio/moriio-minimax-m3-disagg.diff | 112 +++++++++--------- 2 files changed, 76 insertions(+), 65 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md index a75f38854..4941540e3 100644 --- a/benchmarks/multi_node/amd_utils/patches/README.md +++ b/benchmarks/multi_node/amd_utils/patches/README.md @@ -128,17 +128,24 @@ configs): - **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always addresses remote rank == local `tp_rank`, which has no listener once `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode - rank to the prefill rank holding its KV head (`tp_rank // ratio`, - mirroring NIXL's `TpKVTopology.get_target_remote_ranks`). This is - byte-correct only when KV heads are **replicated** (`tp_size >= - total_kv_heads`, i.e. ≤1 distinct head per rank — MiniMax-M3 has 4 KV - heads, so any TP≥4 is replicated). The cases MoRIIO can't serve — - prefill TP > decode TP (needs multi-rank fan-in) and KV-head splitting - (`total_kv_heads > prefill_tp`, which would need per-head slicing of the - NHD layout, unrepresentable as one `(offset,len)` per block) — now - **raise `NotImplementedError`** in `_compute_block_transfer_offsets` - instead of silently transferring corrupt KV. (NIXL likewise only splits - heads in HND layout and raises otherwise.) 
+ rank to the correct single prefill rank. Two regimes, both requiring + **replicated** KV heads (`tp_size >= total_kv_heads`, ≤1 distinct head + per rank — MiniMax-M3 has 4 KV heads, so any TP≥4 is replicated): + - `D-TP > P-TP` (e.g. P4/D8): `tp_rank // ratio`, mirroring NIXL's + `TpKVTopology.get_target_remote_ranks`. Multiple decode ranks read + from one prefill rank. + - `P-TP > D-TP` (e.g. P8/D4): vLLM distributes heads across prefill + ranks in consecutive pairs — (rank0,rank1)→head0, (rank2,rank3)→head1, + etc. Decode rank k must connect to the **first** rank of its head group: + `tp_rank * ratio`. Using `tp_rank` directly (as the original patch did) + is wrong for ranks > 0: decode rank 1 lands on prefill rank 1 (holds + head0) instead of prefill rank 2 (holds head1), producing garbage KV. + The one unsupported case — KV-head **splitting** (`total_kv_heads > + prefill_tp`, where each prefill rank holds a distinct head subset that + a decode rank would need to slice from NHD layout, unrepresentable as a + single `(offset,len)` per block) — **raises `NotImplementedError`** in + `_compute_block_transfer_offsets`. (NIXL likewise only splits heads in + HND layout and raises otherwise.) - **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode ranks read from one prefill rank and each ACKs the same `transfer_id`. 
The producer now counts ACKs per `transfer_id` (consumer embeds its own diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff index 700cf26c3..4835397b0 100644 --- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff +++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff @@ -35,38 +35,52 @@ role = "producer" if self.is_producer else "consumer" engine_suffix = ( -@@ -1085,6 +1100,30 @@ +@@ -1085,6 +1100,40 @@ req_id.decode(), ) + def _remote_tp_rank(self, remote_tp_size: int) -> int: -+ """Map this worker's local tp_rank to the remote tp_rank it must -+ address when local and remote TP sizes differ (heterogeneous-TP P/D, -+ e.g. PREFILL_TP_SIZE=4 / DECODE_TP_SIZE=8). ++ """Map this worker's local tp_rank to the single remote tp_rank it must ++ address when local and remote TP sizes differ (heterogeneous-TP P/D). + -+ vLLM replicates KV heads across TP ranks in groups of size -+ local_tp_size/remote_tp_size (see ModelConfig.get_num_kv_heads: -+ max(1, total_kv_heads // tp_size)), so every local rank within a -+ replica group must address the SAME single remote rank -- -+ floor(local_tp_rank / ratio) -- instead of its own raw tp_rank, which -+ has no listener once local tp_size > remote tp_size. Mirrors vLLM's -+ NIXL connector (TpKVTopology.get_target_remote_ranks). The reverse -+ case (remote tp_size > local tp_size, e.g. P-TP > D-TP) would need -+ multi-rank fan-in reads and is not handled here. ++ Two regimes (both require KV heads to be REPLICATED, not split -- see ++ guard in _compute_block_transfer_offsets): ++ ++ * decode-TP > prefill-TP (e.g. P4/D8): multiple decode ranks (in groups ++ of ratio = decode_tp // prefill_tp) share one prefill rank's KV slice. ++ floor(local_tp_rank / ratio) maps each decode rank to its prefill rank. ++ Mirrors NIXL TpKVTopology.get_target_remote_ranks. 
++ * prefill-TP > decode-TP (e.g. P8/D4): vLLM distributes 4 KV heads ++ across 8 prefill ranks in consecutive pairs -- (rank0,rank1)→head0, ++ (rank2,rank3)→head1, etc. Each decode rank must address the FIRST rank ++ of its paired group: local_tp_rank * ratio (NOT the same-indexed rank, ++ which would land in the wrong head's group for ranks > 0). ++ Head-splitting is rejected in _compute_block_transfer_offsets. + """ + if remote_tp_size == self.world_size: + return self.tp_rank ++ if remote_tp_size > self.world_size: ++ # Prefill-TP > decode-TP (e.g. P8/D4, replicated KV heads). ++ # vLLM pairs prefill ranks per head: decode rank k must connect to ++ # the first prefill rank of its head group (k * ratio), NOT rank k. ++ # Example (P8/D4, 4 KV heads): decode rank 1 (head1) → prefill ++ # rank 2 (not rank 1, which holds head0 alongside rank 0). ++ assert remote_tp_size % self.world_size == 0, ( ++ f"remote tp_size {remote_tp_size} must be a multiple of local " ++ f"tp_size {self.world_size} for heterogeneous-TP P/D" ++ ) ++ return self.tp_rank * (remote_tp_size // self.world_size) ++ # Decode-TP > prefill-TP: floor-map multiple decode ranks to one prefill rank. + assert self.world_size % remote_tp_size == 0, ( + f"local tp_size {self.world_size} must be a multiple of remote " -+ f"tp_size {remote_tp_size} for heterogeneous-TP P/D (remote " -+ "tp_size > local tp_size is not supported)" ++ f"tp_size {remote_tp_size} for heterogeneous-TP P/D" + ) + return self.tp_rank // (self.world_size // remote_tp_size) + def _moriio_handshake( self, host: str, -@@ -1101,7 +1140,9 @@ +@@ -1101,7 +1150,9 @@ # a hack to keep us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. 
@@ -77,7 +91,7 @@ path = make_zmq_path("tcp", host, port + port_offset) logger.debug("handshake Querying metadata on path: %s", path) -@@ -1233,8 +1274,10 @@ +@@ -1233,8 +1284,10 @@ block_size, kv_latent_dim = block_shape self.slot_size_bytes = kv_elem_size * kv_latent_dim else: @@ -90,7 +104,7 @@ block_rank = 3 # [block_size, kv_heads, head_dim] block_shape = first_kv_cache.shape[-block_rank:] block_size, n_kv_heads, head_dim = block_shape[-3:] -@@ -1257,10 +1300,17 @@ +@@ -1257,10 +1310,17 @@ caches_data = [] for cache_or_caches in kv_caches.values(): @@ -110,7 +124,7 @@ caches_data.append((base_addr, region_len, cache.device.index, "")) kv_caches_base_addr.append(base_addr) -@@ -1338,13 +1388,45 @@ +@@ -1338,13 +1398,45 @@ done_sending, done_recving = set(), set() if self.is_producer: @@ -161,7 +175,7 @@ } else: if self.mode == MoRIIOMode.WRITE: -@@ -1389,7 +1471,13 @@ +@@ -1389,7 +1481,13 @@ if last.Succeeded(): host, port, xfer_id = self._recving_transfers_callback_addr[req_id] done_req_ids.add(xfer_id) @@ -176,7 +190,7 @@ to_remove.append(req_id) elif last.Failed(): logger.error( -@@ -1402,7 +1490,9 @@ +@@ -1402,7 +1500,9 @@ ) host, port, xfer_id = self._recving_transfers_callback_addr[req_id] try: @@ -187,7 +201,7 @@ except Exception: logger.exception( "Failed to send error notification for request %s", -@@ -1488,6 +1578,15 @@ +@@ -1488,6 +1588,15 @@ """ self.transfer_id_to_request_id = metadata.transfer_id_to_request_id if self.is_producer: @@ -203,7 +217,7 @@ self.moriio_wrapper.async_wait_reqid() return if self.mode == MoRIIOMode.WRITE: -@@ -1560,6 +1659,7 @@ +@@ -1560,6 +1669,7 @@ remote_block_ids=meta.remote_block_ids, remote_host=meta.remote_host, remote_notify_port=meta.remote_notify_port, @@ -211,7 +225,7 @@ ) def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer): -@@ -1653,6 +1753,7 @@ +@@ -1653,6 +1763,7 @@ local_block_ids: list[int], remote_block_ids: list[int], remote_moriio_meta: MoRIIOAgentMetadata, @@ -219,7 
+233,7 @@ ) -> tuple[list[int], list[int], list[int]]: """Compute transfer offsets for block data. -@@ -1661,25 +1762,110 @@ +@@ -1661,25 +1772,100 @@ local_block_ids: IDs of local blocks remote_block_ids: IDs of remote blocks remote_moriio_meta: Metadata of the remote MoRIIO agent @@ -291,43 +305,31 @@ + slot_elems *= layer_shape[ax] + + # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) ----------- -+ # When local (decode) and remote (prefill) tp_size differ, _remote_tp_rank -+ # maps each local rank to the single remote rank it reads from. That -+ # whole-block read is byte-correct *only* when KV heads are REPLICATED, -+ # i.e. the remote rank holds exactly the head(s) this rank owns and no -+ # more. vLLM replicates KV heads whenever tp_size >= total_kv_heads -+ # (ModelConfig.get_num_kv_heads -> max(1, total//tp)); with the -+ # r // tp_ratio rank mapping, head ownership then lines up exactly, so -+ # no head offset is needed. This is MiniMax-M3's regime (4 KV heads, -+ # TP>=4) and is the only heterogeneous-TP case MoRIIO supports. ++ # When P/D TP sizes differ, _remote_tp_rank maps each decode rank to a ++ # single remote rank; that whole-block read is byte-correct only when KV ++ # heads are REPLICATED on the remote (prefill) side. ++ # ++ # Supported regimes (replicated heads, i.e. remote_heads <= local_heads): ++ # * D-TP > P-TP (e.g. P4/D8): multiple decode ranks share one prefill ++ # rank's slice (floor-ratio mapping). ++ # * P-TP > D-TP (e.g. P8/D4): each decode rank reads from same-indexed ++ # prefill rank (self.tp_rank mapping). MiniMax-M3's regime: 4 KV heads ++ # fully replicated at TP>=4. + # -+ # If instead a remote rank packs MORE distinct KV heads than this local -+ # rank owns (total_kv_heads > remote_tp, i.e. heads are SPLIT on the -+ # producer), each fan-in rank would have to read only its head slice of -+ # the remote block. 
MoRIIO's per-block tensors are NHD -+ # ([block_size, kv_heads, head_dim] -- heads interleaved per token), so -+ # a head slice is NOT a contiguous sub-region and cannot be expressed as -+ # a single (offset, len) per block. NIXL only supports head splitting in -+ # HND layout and raises otherwise; we do the same -- fail loud rather -+ # than silently corrupt KV. MLA / rank-3 indexer caches are always -+ # replicated (no K/V axis) and never hit this path. ++ # Unsupported: heads SPLIT on prefill (remote_heads > local_heads). ++ # MoRIIO's NHD layout (heads interleaved per token) makes a head slice ++ # non-contiguous and inexpressible as a single (offset, len) per block. ++ # NIXL raises for the same reason; we do the same. MLA / rank-3 indexer ++ # caches are always replicated (no K/V axis) and bypass this guard. + local_tp = self.world_size + remote_tp = remote_tp_size if remote_tp_size is not None else local_tp + if remote_tp != local_tp and not self.use_mla and kv_axis is not None: + total_kv_heads = self.model_config.get_total_num_kv_heads() -+ if remote_tp > local_tp: -+ # Prefill TP > decode TP: this rank would need to fan IN reads -+ # from multiple remote ranks (NIXL's negative-tp_ratio path). -+ raise NotImplementedError( -+ f"Heterogeneous-TP with remote (prefill) tp_size {remote_tp} " -+ f"> local (decode) tp_size {local_tp} requires multi-rank " -+ "fan-in reads, not supported by MoRIIOConnector." -+ ) + remote_heads = max(1, total_kv_heads // remote_tp) + local_heads = max(1, total_kv_heads // local_tp) + if remote_heads > local_heads: -+ # KV heads are split (not replicated) on the producer -> would -+ # need NHD head slicing, which MoRIIO can't express per block. ++ # KV heads are SPLIT on prefill -- whole-block read is incorrect. ++ # Applies in both TP-mismatch directions; fail loud. 
+ raise NotImplementedError( + f"Heterogeneous-TP head splitting (total_kv_heads " + f"{total_kv_heads} > prefill tp_size {remote_tp}: " @@ -336,6 +338,8 @@ + "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= " + "total_kv_heads so KV heads are replicated." + ) ++ # remote_heads <= local_heads: replicated. _remote_tp_rank selects the ++ # correct remote rank; whole-block read is byte-correct. + + transfer_size_byte = slot_elems * sz From db261e0fbb76a9704b2220ad9d8c14ccd3aa8a69 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Fri, 19 Jun 2026 13:03:09 +0000 Subject: [PATCH 15/20] fix(moriio-diff): correct hunk header count after _remote_tp_rank expansion The P>D fix added 4 lines to _remote_tp_rank but the hunk header still said +1100,40; patch aborted with "malformed patch at line 79". Update to +1100,44 to match the actual 6 context + 38 added lines. Co-Authored-By: Claude Sonnet 4.6 --- .../amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff index 4835397b0..83ae80d13 100644 --- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff +++ b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff @@ -35,7 +35,7 @@ role = "producer" if self.is_producer else "consumer" engine_suffix = ( -@@ -1085,6 +1100,40 @@ +@@ -1085,6 +1100,44 @@ req_id.decode(), ) From 09efb99c17f07aebfd93f96bbda91dd99557a77c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Fri, 19 Jun 2026 14:38:47 +0000 Subject: [PATCH 16/20] fix(amd-disagg): keep MoRIIO patch cmd inside container bash -lc quotes The MoRIIO KV-layout patch was injected into the per-node container launch via '"${_MORIIO_PATCH_CMD:-}"', which breaks out of the outer srun bash -c "..." double-quoted string. 
Because the patch command value contains spaces and the shell operators '<' and '||', the unquoted expansion word-split the generated container script, truncating it right after the word `patch` and silently dropping the patch arguments AND the server.sh launch. The container then exited 0:0 within seconds, producing no benchmark/eval output -> collect_latest_results found "No logs directory" -> the launch step failed with exit 1 (all minimax-m3 disagg jobs affected). Fix: expand ${_MORIIO_PATCH_CMD:-} directly inside the inner bash -lc single quotes (no quote toggling), so the patch command stays intact and its operators are parsed by the container shell. Validated end-to-end: gsm8k recovers from ~0 (garbage) to 0.94-0.98 across P8D8/P4D8/P8D4. Co-authored-by: Cursor --- benchmarks/multi_node/amd_utils/job.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 1a546b361..3afa103f2 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -640,7 +640,7 @@ fi \"$DOCKER_IMAGE_NAME\" bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' - '"${_MORIIO_PATCH_CMD:-}"' + ${_MORIIO_PATCH_CMD:-} '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' From aad872a4c69482d702017d337d5bffee539b50f7 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:23:59 -0400 Subject: [PATCH 17/20] disagg #1762: add 2P TP4 + 1D TP8 layout at conc 256,512,768,1024 (1k1k & 8k1k) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1) — 3 nodes total. Added to both seq-len scenarios at conc 256,512,768,1024. 
Eval marking unchanged (still one lm-eval on the 8k1k TP8+TP8 layout). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 46 +++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index aa4887ad0..ccce01f4f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2801,11 +2801,29 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - # 8k1k disagg sweep across three P/D layouts (1P TP8 + 1D TP8 conc 1..1024; - # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024). The multi-node - # eval policy (8k1k + conc >= 16) marks one lm-eval on the highest-max-conc - # layout (TP8+TP8, eval-conc=median=128) — validating the M3 MoRI-IO disagg - # pipeline's correctness end-to-end. + # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) + # feeding one full-node TP8 decode, at high conc 256,512,768,1024. + - spec-decoding: "none" + conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024; + # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8 + # conc 256..1024). The multi-node eval policy (8k1k + conc >= 16) marks one + # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) — + # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end. - isl: 8192 osl: 1024 search-space: @@ -2860,3 +2878,21 @@ minimaxm3-fp8-mi355x-vllm-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" + # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2) + # feeding one full-node TP8 decode, at high conc 256,512,768,1024. 
+ - spec-decoding: "none" + conc-list: [ 256, 512, 768, 1024 ] + prefill: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" From bbb0d78e57eb7aa2d01ef54df8761026fb3510c8 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Mon, 22 Jun 2026 13:52:09 +0000 Subject: [PATCH 18/20] chore(amd-disagg): remove redundant moriio_heterogeneous_kv.py patcher The per-layer READ-offset fix this Python patcher applied to moriio_connector.py is fully subsumed by the unified overlay patches/moriio/moriio-minimax-m3-disagg.diff, which job.slurm applies with `patch -p1` BEFORE server.sh sources setup_deps.sh. The diff rewrites the exact lines the patcher searches for (the `first_layer` single-offset block and the `is_mla = len(self.kv_cache_shape)` sizing), with a stronger geometry-memoized + heterogeneous-TP-aware version, so the patcher's OLD1/OLD2 patterns no longer match and it already no-ops ("pattern not found; skipping") in the real flow. It's also the same fix now upstreamed in vLLM #46039 (READ mixed KV layouts). Drop the dead patcher and its setup_deps.sh hook so the diff is the single source of truth. patches/README.md only documents the diff (no reference to this patcher), so no README change is needed. 
Co-authored-by: Cursor --- .../patches/moriio_heterogeneous_kv.py | 145 ------------------ benchmarks/multi_node/amd_utils/setup_deps.sh | 23 --- 2 files changed, 168 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py diff --git a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py b/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py deleted file mode 100644 index a7ee8c724..000000000 --- a/benchmarks/multi_node/amd_utils/patches/moriio_heterogeneous_kv.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -"""Patch vLLM's MoRIIOConnector to transfer heterogeneous KV caches per-layer. - -Why ---- -MiniMax-M3 (MiniMaxM3SparseForCausalLM) is a hybrid sparse-attention model: - - * main attention layers register a ``FullAttentionSpec`` KV cache: - rank-5 ``[2, num_blocks, block_size, num_kv_heads, head_dim]``, **fp8**, K+V - * the lightning indexer (sparse layers) registers a separate - ``MLAAttentionSpec`` index cache (``MiniMaxM3IndexerCache``): - rank-3 ``[num_blocks, block_size, head_dim]``, **bf16**, key-only - -The upstream MoRIIOConnector assumes a *single uniform* KV layout: it derives -``self.kv_cache_shape`` / ``block_len`` / ``element_size`` from the **first** -cache, and ``_read_blocks`` computes the transfer offsets **once** from -``first_layer`` and reuses them for **every** layer (see the in-code TODO -"block_len needs to be per-layer for ... hybrid attn"). For M3 this transfers -the bf16 key-only rank-3 index cache using the fp8 K+V rank-5 main-cache sizing, -corrupting the indexer state on the decode worker. The sparse layers then select -the wrong KV blocks and the model emits incoherent tokens (gsm8k ~= 0). - -This is the vLLM analogue of the already-shipped SGLang MoRI DSA fix in -``patches/mori_conn.py`` (see patches/README.md). 
- -Fix ---- -Compute transfer geometry **per layer** from each layer's own tensor -(``shape`` / ``stride`` / ``element_size`` / rank), instead of from the first -cache. For homogeneous models every layer's geometry equals the first cache's, -so behaviour is unchanged; only hybrid models (M3) are affected. - -Two minimal, targeted edits (READ path, which the M3 recipe uses with -``read_mode: true``): - - 1. ``_compute_block_transfer_offsets`` -> use ``self.kv_caches[layer_name]``'s - own shape (rank/dims) instead of the global ``self.kv_cache_shape``. - 2. ``_read_blocks`` -> call ``_compute_block_transfer_offsets`` inside the - per-layer loop instead of once for ``first_layer``. - -Idempotent: re-running detects the ``PATCHED heterogeneous-kv`` marker and exits. -""" -import os -import sys - - -def _default_target() -> str: - try: - import vllm - except Exception: - return "" - return os.path.join( - os.path.dirname(vllm.__file__), - "distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py", - ) - - -OLD1 = ''' assert self.kv_cache_shape is not None, "KV caches shape not initialized" - is_mla = len(self.kv_cache_shape) == 3 - stride = self.kv_caches[layer_name].stride() - sz = self.kv_caches[layer_name].element_size() - if is_mla: - blknum, blksize, hs = self.kv_cache_shape - hn = 1 - block_stride = stride[0] - else: - _, blknum, blksize, hn, hs = self.kv_cache_shape''' - -NEW1 = ''' # [PATCHED heterogeneous-kv] Use this layer's own shape so caches with a - # different rank/dtype (MiniMax-M3: bf16 key-only rank-3 index cache vs - # fp8 K+V rank-5 main cache) are sized per-layer, not from the first cache. 
- layer_shape = tuple(self.kv_caches[layer_name].shape) - assert layer_shape, "KV caches shape not initialized" - is_mla = len(layer_shape) == 3 - stride = self.kv_caches[layer_name].stride() - sz = self.kv_caches[layer_name].element_size() - if is_mla: - blknum, blksize, hs = layer_shape - hn = 1 - block_stride = stride[0] - else: - _, blknum, blksize, hn, hs = layer_shape''' - -OLD2 = ''' first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] - offs = self._compute_block_transfer_offsets( - first_layer, local_block_ids, remote_block_ids, remote_moriio_meta - ) - - for layer_name in self.layer_name_to_local_kv_cache_metadata: - sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( - layer_name - ) - # TODO : apply multi-session batch-read when moriio support it - transfer_status = self.moriio_wrapper.read_remote_data( - offs[2], offs[0], offs[1], sessions[sess_idx] - )''' - -NEW2 = ''' for layer_name in self.layer_name_to_local_kv_cache_metadata: - sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( - layer_name - ) - # [PATCHED heterogeneous-kv] Per-layer offsets so the bf16 key-only - # MiniMax-M3 index cache is transferred with its own geometry instead - # of the first (main fp8 K+V) layer's. 
- offs = self._compute_block_transfer_offsets( - layer_name, local_block_ids, remote_block_ids, remote_moriio_meta - ) - # TODO : apply multi-session batch-read when moriio support it - transfer_status = self.moriio_wrapper.read_remote_data( - offs[2], offs[0], offs[1], sessions[sess_idx] - )''' - - -def main() -> int: - target = sys.argv[1] if len(sys.argv) > 1 else _default_target() - if not target or not os.path.isfile(target): - print(f"[PATCH] moriio_connector.py not found ({target!r}); skipping") - return 0 - src = open(target).read() - if "PATCHED heterogeneous-kv" in src: - print("[PATCH] moriio heterogeneous-kv already applied") - return 0 - if OLD1 not in src: - print("[PATCH] WARN: _compute_block_transfer_offsets pattern not found; " - "connector version changed — skipping (no-op)") - return 0 - if OLD2 not in src: - print("[PATCH] WARN: _read_blocks pattern not found; " - "connector version changed — skipping (no-op)") - return 0 - src = src.replace(OLD1, NEW1, 1).replace(OLD2, NEW2, 1) - # Validate it still compiles before writing. - try: - compile(src, target, "exec") - except SyntaxError as e: - print(f"[PATCH] ERROR: patched source fails to compile: {e}") - return 1 - open(target, "w").write(src) - print("[PATCH] Applied: moriio heterogeneous-kv per-layer transfer " - "(MiniMax-M3 sparse index cache)") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/benchmarks/multi_node/amd_utils/setup_deps.sh b/benchmarks/multi_node/amd_utils/setup_deps.sh index 3e5d82c0c..35eaf17dc 100644 --- a/benchmarks/multi_node/amd_utils/setup_deps.sh +++ b/benchmarks/multi_node/amd_utils/setup_deps.sh @@ -185,28 +185,6 @@ install_transformers_glm5() { _SETUP_INSTALLED+=("transformers-glm5") } -# --------------------------------------------------------------------------- -# vLLM: Patch MoRIIOConnector for heterogeneous (hybrid sparse-attn) KV caches. 
-# -# MiniMax-M3 registers a bf16 key-only rank-3 lightning-indexer cache alongside -# the fp8 K+V rank-5 main cache. Upstream MoRIIO derives one uniform block -# geometry from the first cache and reuses the first layer's transfer offsets -# for every layer, corrupting the index cache on the decode worker -> garbage -# output (gsm8k ~= 0). The overlay makes the READ path compute geometry/offsets -# per layer. Idempotent; no-op on connector versions that don't match. -# See patches/moriio_heterogeneous_kv.py and patches/README.md. -# --------------------------------------------------------------------------- -patch_moriio_heterogeneous_kv() { - local patcher - patcher="$(dirname "${BASH_SOURCE[0]}")/patches/moriio_heterogeneous_kv.py" - if [[ ! -f "$patcher" ]]; then - echo "[SETUP] moriio heterogeneous-kv patcher not found, skipping" - return 0 - fi - python3 "$patcher" || echo "[SETUP] WARN: moriio heterogeneous-kv patch returned non-zero" - _SETUP_INSTALLED+=("moriio-heterogeneous-kv") -} - # ============================================================================= # Run installers (engine-gated) # ============================================================================= @@ -214,7 +192,6 @@ patch_moriio_heterogeneous_kv() { if [[ "$ENGINE" == "vllm-disagg" ]]; then install_recipe_deps install_amd_quark - patch_moriio_heterogeneous_kv # ========================================================================= # vLLM: Export UCX/RIXL paths (persists since this file is sourced) From 815c78c3594ff9626937ff4816265282da4f9518 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 24 Jun 2026 12:55:22 +0000 Subject: [PATCH 19/20] Use upstream nightly image for MiniMax-M3 disagg, drop MoRIIO overlay - Co-work with Gupta, Ravi All three MoRIIO fixes the in-tree overlay carried have merged upstream and now ship in the ROCm nightly image: - vLLM #46039 READ-mode mixed KV-layout (axis-aware per-layer offsets) - vLLM #46290 WRITE-mode per-geometry offset caching - vLLM 
#46332 heterogeneous-TP rank mapping + ACK fan-in Point minimaxm3-fp8-mi355x-vllm-disagg at vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15 (vLLM 0.23.1rc1.dev363+g556bc4e3a, which contains all three merges) and remove the stop-gap overlay: - delete patches/moriio/moriio-minimax-m3-disagg.diff - drop the job.slurm in-container auto-apply block (+ MORIIO_KV_PATCH gate) - trim the moriio/ section from patches/README.md Verified on the nightly image with NO patch across all four P/D layouts x conc {1,4,8}, gsm8k strict/flexible 0.95-0.97 (1P8+1D8, 1P4+1D8, 1P4+1D4, 2P4+1D8) -- matching the previously-patched results. Refs #1762. --- .github/configs/amd-master.yaml | 2 +- benchmarks/multi_node/amd_utils/job.slurm | 49 -- .../multi_node/amd_utils/patches/README.md | 105 +--- .../moriio/moriio-minimax-m3-disagg.diff | 483 ------------------ 4 files changed, 6 insertions(+), 633 deletions(-) delete mode 100644 benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8e017fa96..38908ec86 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2737,7 +2737,7 @@ minimaxm3-fp8-mi325x-vllm-mtp: # TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in # benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8). 
minimaxm3-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 8a1fda4c8..977bcaecc 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -80,54 +80,6 @@ if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \ export EXTRA_DOCKER_MOUNTS echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}" fi - -# ── In-tree vLLM MoRIIO patch: auto-apply for known-affected images ── -# The vLLM MoRIIOConnector (image vllm/vllm-openai-rocm:minimax-m3) ships a -# transposed-KV-layout bug: it assumes the FlashAttention layout -# [2, num_blocks, ...] (K/V axis outer) but this vLLM's backends allocate -# [num_blocks, 2, ...] (K/V axis inner), so every disagg block transfer reads -# the wrong region. Invisible to throughput, but corrupts GQA/non-MLA accuracy -# (MiniMax-M3 gsm8k 0.0008 -> 0.958). Fix ships as a unified diff (see -# patches/moriio/ and patches/README.md), applied to the vLLM package dir -# inside the container at startup, ahead of the server launch below. -# -# Auto-applied when the image tag contains "minimax-m3" (and not the already- -# fixed "-hetkv" rebuild), unless the caller sets MORIIO_KV_PATCH=skip. The -# repo is already bind-mounted at DOCKER_MOUNT_PATH ("/workspace"), so the -# diff needs no extra mount -- just an in-container `patch` call. A failed -# apply aborts the container: silently running unpatched would silently -# corrupt accuracy, not just skip a feature. -# -# The single diff bundles three layered fixes (all in patches/moriio/, -# see patches/README.md): -# 1. KV-layout: the load-bearing accuracy fix (axis-aware per-layer offsets; -# gsm8k 0.0008 -> 0.958). Required for homogeneous TP too. -# 2. 
heterogeneous-TP (no-op for homogeneous TP, required for -# PREFILL_TP_SIZE != DECODE_TP_SIZE -- see nvidia/amd-master.yaml's TP4+TP8 -# configs): handshake/notify port addressing maps each decode rank to the -# correct prefill rank instead of its own raw tp_rank (stock -# MoRIIOConnector has no fan-out concept at all), and guards (fail loud) -# the KV-head-split / prefill-TP>decode-TP cases MoRIIO can't serve. -# 3. dup-ack: with DECODE_TP_SIZE > PREFILL_TP_SIZE, N decode ranks fan in to -# 1 prefill rank and each sends its own completion ack for the same -# transfer_id. Freeing KV blocks on the first ack (the original -# MoRIIOConnector behavior) both crashes EngineCore on the late second ack -# (AssertionError in Scheduler._update_from_kv_xfer_finished) and risks -# silently corrupting KV if the slower decode rank's read is still in -# flight when the blocks are reused. Fix mirrors NIXL's -# consumer_notification_counts_by_req: producer counts acks per -# transfer_id (consumer embeds its own tp_size in the notify message) and -# only frees once all expected consumers have acked. 
-_MORIIO_DIFF="$DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff" -_MORIIO_PATCH_CMD="" -if [[ "${MORIIO_KV_PATCH:-auto}" != "skip" ]] \ - && [[ -f "$_MORIIO_DIFF" ]] \ - && [[ "${DOCKER_IMAGE_NAME:-}" == *"minimax-m3"* ]] \ - && [[ "${DOCKER_IMAGE_NAME:-}" != *"hetkv"* ]]; then - _MORIIO_PATCH_CMD="patch -p1 -d /usr/local/lib/python3.12/dist-packages < /workspace/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff || exit 1" - echo "[job.slurm] will auto-apply vLLM MoRIIO KV-layout + heterogeneous-TP + dup-ack diff inside container: ${_MORIIO_DIFF}" -fi - xP="${xP:-1}" yD="${yD:-1}" @@ -640,7 +592,6 @@ fi \"$DOCKER_IMAGE_NAME\" bash -lc ' set -o pipefail mkdir -p /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"' - ${_MORIIO_PATCH_CMD:-} '"$RUN_FILE_FULL"' 2>&1 | tee /run_logs/slurm_job-'\"\$SLURM_JOB_ID\"'/server_\$(hostname).log ' diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md index 4941540e3..765d571b2 100644 --- a/benchmarks/multi_node/amd_utils/patches/README.md +++ b/benchmarks/multi_node/amd_utils/patches/README.md @@ -8,18 +8,16 @@ block our benchmark + accuracy configs — so we can keep reusing the - `mori_conn.py` — single-file overlay (bind-mounted) for the **sglang** MoRI backend. -- `moriio/` — unified-diff overlay (applied with `patch` at container - startup) for the **vLLM** MoRIIO connector (`minimax-m3` image). See its - section below. + +> Note: the vLLM MoRIIO `minimax-m3` overlay (`moriio/`) was retired once the +> upstream fixes (vLLM #46039 / #46290 / #46332) shipped in the ROCm nightly +> image; `minimaxm3-fp8-mi355x-vllm-disagg` now runs the stock nightly directly. The `mori_conn.py` overlay is wired through the `EXTRA_DOCKER_MOUNTS` env var that `job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after the existing `-v` block). 
The local-test driver scripts under `scripts/sglang_disagg/` pre-set this env var to the path of the relevant -overlay; CI runners that need the patch can do the same. The `moriio/` -diff needs no extra mount — the repo (and thus the diff file) is already -bind-mounted into the container — `job.slurm` just runs `patch` against it -before launching the server; see "How to enable" in its section below. +overlay; CI runners that need the patch can do the same. ## `mori_conn.py` @@ -82,99 +80,6 @@ When this env var is unset (CI default for runs that don't need the patch), `${EXTRA_DOCKER_MOUNTS:-}` expands to the empty string and container behavior is byte-identical to the unpatched path. -## `moriio/` (vLLM MoRIIO connector, MiniMax-M3) - -A single unified diff (`moriio-minimax-m3-disagg.diff`), applied with -`patch -p1` against the vLLM package dir inside the container, touching -three files: - -``` -/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/moriio/ - ├── moriio_connector.py - ├── moriio_engine.py - └── moriio_common.py -``` - -Source: forked from the stock `vllm/vllm-openai-rocm:minimax-m3` image -(vLLM `0.22.1rc1.dev490`). - -**Bug (general MoRIIO, not M3-specific):** the connector assumed the -FlashAttention KV layout `[2, num_blocks, block_size, heads, head_dim]` -(K/V axis **outer**), but this vLLM's attention backends (standard -`TRITON_ATTN` **and** the M3 sparse backend) allocate -`[num_blocks, 2, block_size, heads, head_dim]` (K/V axis **inner**). -`_compute_block_transfer_offsets` indexed blocks with `stride[1]` (the -K/V stride) instead of `stride[0]` (the block stride), so every disagg -block transfer read the wrong region. Invisible to throughput -benchmarks (they don't check output); only the **gsm8k accuracy eval** -catches it. The connector was only ever correct for MLA models -(DeepSeek, rank-3 path); MiniMax-M3 is GQA + sparse lightning-indexer -→ broken (disagg gsm8k `0.0008` token salad). 
- -**Fix** — axis-aware offset computation: detect the block axis + optional -size-2 K/V axis from each layer's real shape/stride, compute offsets per -distinct geometry (handles M3's 2nd geometry, the rank-3 bf16 key-only -indexer cache), `num_blocks = shape[0]`; the WRITE path memoizes offsets -per geometry. Result: disagg gsm8k `strict-match 0.9583 / -flexible-extract 0.9575` (matches single-node). Homogeneous models -(uniform layout) are unaffected — one geometry, one offset set, same -result. Full write-up in -`/apps/ditian12/m3_disagg_manual/moriio_hetkv_fix/README.md`. - -The diff also bundles two heterogeneous-TP layers (no-op for homogeneous -TP, exercised by `nvidia/amd-master.yaml`'s TP4-prefill + TP8-decode -configs): - -- **heterogeneous-TP addressing + guard:** stock MoRIIOConnector always - addresses remote rank == local `tp_rank`, which has no listener once - `DECODE_TP_SIZE > PREFILL_TP_SIZE`. `_remote_tp_rank` maps each decode - rank to the correct single prefill rank. Two regimes, both requiring - **replicated** KV heads (`tp_size >= total_kv_heads`, ≤1 distinct head - per rank — MiniMax-M3 has 4 KV heads, so any TP≥4 is replicated): - - `D-TP > P-TP` (e.g. P4/D8): `tp_rank // ratio`, mirroring NIXL's - `TpKVTopology.get_target_remote_ranks`. Multiple decode ranks read - from one prefill rank. - - `P-TP > D-TP` (e.g. P8/D4): vLLM distributes heads across prefill - ranks in consecutive pairs — (rank0,rank1)→head0, (rank2,rank3)→head1, - etc. Decode rank k must connect to the **first** rank of its head group: - `tp_rank * ratio`. Using `tp_rank` directly (as the original patch did) - is wrong for ranks > 0: decode rank 1 lands on prefill rank 1 (holds - head0) instead of prefill rank 2 (holds head1), producing garbage KV. 
- The one unsupported case — KV-head **splitting** (`total_kv_heads > - prefill_tp`, where each prefill rank holds a distinct head subset that - a decode rank would need to slice from NHD layout, unrepresentable as a - single `(offset,len)` per block) — **raises `NotImplementedError`** in - `_compute_block_transfer_offsets`. (NIXL likewise only splits heads in - HND layout and raises otherwise.) -- **dup-ack fan-in:** with `DECODE_TP_SIZE > PREFILL_TP_SIZE`, N decode - ranks read from one prefill rank and each ACKs the same `transfer_id`. - The producer now counts ACKs per `transfer_id` (consumer embeds its own - `tp_size` in the notify payload) and only reports `finished_sending` - once all expected consumers have ACKed — preventing both the late-ACK - `EngineCore` crash and freeing/reusing KV blocks while a slower decode - rank is still reading. Mirrors NIXL's - `consumer_notification_counts_by_req`. - -### How to enable - -`job.slurm` auto-applies this diff when `DOCKER_IMAGE_NAME` contains -`minimax-m3` (and not the already-fixed `-hetkv` rebuild), unless the -caller sets `MORIIO_KV_PATCH=skip`. To wire it by hand (e.g. the -`m3_disagg_manual/run_manual_2node.sh` driver, which sets -`MORIIO_KV_PATCH`), run inside the container before the server starts: - -```bash -patch -p1 -d /usr/local/lib/python3.12/dist-packages \ - < $DI_REPO_DIR/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff -``` - -(`$DI_REPO_DIR` is the InferenceX checkout root that `job.slurm` already -mounts into the container at `/workspace`.) - -This lets the **stock** `minimax-m3` image be reused for the E2E -accuracy run — no `-hetkv` rebuild needed. Retire the overlay once the -fix lands in a published image; it is not yet upstreamed. - ## When to use which patch | Image / version | Need `mori_conn.py` overlay? 
| diff --git a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff b/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff deleted file mode 100644 index 83ae80d13..000000000 --- a/benchmarks/multi_node/amd_utils/patches/moriio/moriio-minimax-m3-disagg.diff +++ /dev/null @@ -1,483 +0,0 @@ ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_common.py -@@ -80,6 +80,10 @@ - writes_done: int = 0 - decode_dp_rank: int = 0 - transfer_offset: tuple[list[int], list[int], list[int]] | None = None -+ # Per-layer-geometry offset cache (keyed by shape/stride/dtype) for -+ # heterogeneous-KV (hybrid/sparse) models. Homogeneous models populate a -+ # single entry. See MoRIIOWriter._prepare_transfer_plan. -+ transfer_offsets: dict = field(default_factory=dict) - - - class ROLE(Enum): ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py -@@ -740,6 +740,21 @@ - # Completions that arrived before transfer_id_to_request_id was populated. - # Retried each step until the mapping is established. - self._unmatched_write_completions: set[str] = set() -+ # Producer side: with heterogeneous-TP P/D (DECODE_TP_SIZE > -+ # PREFILL_TP_SIZE), more than one decode rank reads this rank's KV -+ # slice and each sends its own completion notify for the SAME -+ # transfer_id once its own read finishes. We must not report -+ # finished_sending (which lets the core scheduler free/reuse the -+ # blocks) until *all* of them have acked -- otherwise a still-in- -+ # flight slower reader can read corrupted/reused memory. This counts -+ # per-transfer_id notifies against the expected fan-in count (derived -+ # from the consumer's own tp_size, sent alongside the transfer_id -- -+ # see send_notify call sites) and only resolves once it's complete. 
-+ # Mirrors NIXL's consumer_notification_counts_by_req. Pruned in -+ # start_load_kv() once a transfer_id drops out of the live mapping -+ # (e.g. force-freed by the scheduler's defer_timeout without ever -+ # reaching full count). -+ self._consumer_notification_counts: dict[str, int] = {} - - role = "producer" if self.is_producer else "consumer" - engine_suffix = ( -@@ -1085,6 +1100,44 @@ - req_id.decode(), - ) - -+ def _remote_tp_rank(self, remote_tp_size: int) -> int: -+ """Map this worker's local tp_rank to the single remote tp_rank it must -+ address when local and remote TP sizes differ (heterogeneous-TP P/D). -+ -+ Two regimes (both require KV heads to be REPLICATED, not split -- see -+ guard in _compute_block_transfer_offsets): -+ -+ * decode-TP > prefill-TP (e.g. P4/D8): multiple decode ranks (in groups -+ of ratio = decode_tp // prefill_tp) share one prefill rank's KV slice. -+ floor(local_tp_rank / ratio) maps each decode rank to its prefill rank. -+ Mirrors NIXL TpKVTopology.get_target_remote_ranks. -+ * prefill-TP > decode-TP (e.g. P8/D4): vLLM distributes 4 KV heads -+ across 8 prefill ranks in consecutive pairs -- (rank0,rank1)→head0, -+ (rank2,rank3)→head1, etc. Each decode rank must address the FIRST rank -+ of its paired group: local_tp_rank * ratio (NOT the same-indexed rank, -+ which would land in the wrong head's group for ranks > 0). -+ Head-splitting is rejected in _compute_block_transfer_offsets. -+ """ -+ if remote_tp_size == self.world_size: -+ return self.tp_rank -+ if remote_tp_size > self.world_size: -+ # Prefill-TP > decode-TP (e.g. P8/D4, replicated KV heads). -+ # vLLM pairs prefill ranks per head: decode rank k must connect to -+ # the first prefill rank of its head group (k * ratio), NOT rank k. -+ # Example (P8/D4, 4 KV heads): decode rank 1 (head1) → prefill -+ # rank 2 (not rank 1, which holds head0 alongside rank 0). 
-+ assert remote_tp_size % self.world_size == 0, ( -+ f"remote tp_size {remote_tp_size} must be a multiple of local " -+ f"tp_size {self.world_size} for heterogeneous-TP P/D" -+ ) -+ return self.tp_rank * (remote_tp_size // self.world_size) -+ # Decode-TP > prefill-TP: floor-map multiple decode ranks to one prefill rank. -+ assert self.world_size % remote_tp_size == 0, ( -+ f"local tp_size {self.world_size} must be a multiple of remote " -+ f"tp_size {remote_tp_size} for heterogeneous-TP P/D" -+ ) -+ return self.tp_rank // (self.world_size // remote_tp_size) -+ - def _moriio_handshake( - self, - host: str, -@@ -1101,7 +1150,9 @@ - # a hack to keep us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - -- port_offset = get_port_offset(remote_dp_rank, self.tp_rank) -+ port_offset = get_port_offset( -+ remote_dp_rank, self._remote_tp_rank(remote_tp_size) -+ ) - path = make_zmq_path("tcp", host, port + port_offset) - logger.debug("handshake Querying metadata on path: %s", path) - -@@ -1233,8 +1284,10 @@ - block_size, kv_latent_dim = block_shape - self.slot_size_bytes = kv_elem_size * kv_latent_dim - else: -- # [2 (k and v), num_blocks, ...] -- self.num_blocks = first_kv_cache.shape[1] -+ # Layout (num_blocks, 2, block_size, kv_heads, head_dim): the K/V -+ # axis is INNER (axis 1) and num_blocks is axis 0. (The old code read -+ # shape[1] here, which is the size-2 K/V axis, not num_blocks.) -+ self.num_blocks = first_kv_cache.shape[0] - block_rank = 3 # [block_size, kv_heads, head_dim] - block_shape = first_kv_cache.shape[-block_rank:] - block_size, n_kv_heads, head_dim = block_shape[-3:] -@@ -1257,10 +1310,17 @@ - caches_data = [] - - for cache_or_caches in kv_caches.values(): -- cache_list = [cache_or_caches] if use_mla else cache_or_caches -+ # Per-layer rank: rank-3 (MLA / sparse indexer, single tensor) vs -+ # rank-5 (full attention, [K, V]). 
A single global use_mla flag -+ # mis-iterates the rank-3 indexer cache (over its num_blocks dim) for -+ # hybrid models, so detect per cache. region_len is the actual tensor -+ # (or K/V half) byte size -- equivalent to num_blocks * block_len for -+ # homogeneous models, correct for heterogeneous ones. -+ cache_is_mla = cache_or_caches.dim() == 3 -+ cache_list = [cache_or_caches] if cache_is_mla else cache_or_caches - for cache in cache_list: - base_addr = cache.data_ptr() -- region_len = self.num_blocks * self.block_len -+ region_len = cache.numel() * cache.element_size() - caches_data.append((base_addr, region_len, cache.device.index, "")) - kv_caches_base_addr.append(base_addr) - -@@ -1338,13 +1398,45 @@ - done_sending, done_recving = set(), set() - - if self.is_producer: -- # pop_finished_req_ids returns transfer_ids (the ZMQ payload sent -- # by decode via send_notify); map back to req_ids for the scheduler. -- finished_transfer_ids = self.moriio_wrapper.pop_finished_req_ids() -+ # pop_finished_req_ids returns every completion message received -+ # since the last call (NOT deduped -- with heterogeneous-TP -+ # fan-out, two different decode ranks legitimately send -+ # byte-identical messages for the same transfer_id and each one -+ # must be counted). Payload is "<transfer_id>:<consumer_tp_size>" -+ # (see send_notify call sites); plain transfer_id with no ":" is -+ # treated as a 1:1 ack (internal WRITE-mode completions). -+ finished_transfer_msgs = self.moriio_wrapper.pop_finished_req_ids() -+ resolved_transfer_ids: set[str] = set() -+ for raw_msg in finished_transfer_msgs: -+ xfer_id, _, tp_size_str = raw_msg.rpartition(":") -+ if not xfer_id: -+ xfer_id, tp_size_str = raw_msg, str(self.world_size) -+ if xfer_id not in self.transfer_id_to_request_id: -+ logger.warning( -+ "Could not find %s in transfer_id_to_request_id " -+ "lookup table. 
This could lead to a possible hang.", -+ xfer_id, -+ ) -+ continue -+ consumer_tp_size = int(tp_size_str) -+ if consumer_tp_size > self.world_size: -+ assert consumer_tp_size % self.world_size == 0, ( -+ f"consumer tp_size {consumer_tp_size} must be a " -+ f"multiple of producer tp_size {self.world_size} " -+ "for heterogeneous-TP P/D" -+ ) -+ expected_acks = consumer_tp_size // self.world_size -+ else: -+ expected_acks = 1 -+ count = self._consumer_notification_counts.get(xfer_id, 0) + 1 -+ if count >= expected_acks: -+ self._consumer_notification_counts.pop(xfer_id, None) -+ resolved_transfer_ids.add(xfer_id) -+ else: -+ self._consumer_notification_counts[xfer_id] = count - done_sending = { - self.transfer_id_to_request_id[xfer_id] -- for xfer_id in finished_transfer_ids -- if xfer_id in self.transfer_id_to_request_id -+ for xfer_id in resolved_transfer_ids - } - else: - if self.mode == MoRIIOMode.WRITE: -@@ -1389,7 +1481,13 @@ - if last.Succeeded(): - host, port, xfer_id = self._recving_transfers_callback_addr[req_id] - done_req_ids.add(xfer_id) -- self.moriio_wrapper.send_notify(xfer_id, host, port) -+ # Embed our own tp_size so the producer can tell, with -+ # heterogeneous-TP fan-out, how many consumer acks to -+ # expect for this transfer_id before it's safe to free -+ # the blocks (see _consumer_notification_counts). 
-+ self.moriio_wrapper.send_notify( -+ f"{xfer_id}:{self.world_size}", host, port -+ ) - to_remove.append(req_id) - elif last.Failed(): - logger.error( -@@ -1402,7 +1500,9 @@ - ) - host, port, xfer_id = self._recving_transfers_callback_addr[req_id] - try: -- self.moriio_wrapper.send_notify(xfer_id, host, port) -+ self.moriio_wrapper.send_notify( -+ f"{xfer_id}:{self.world_size}", host, port -+ ) - except Exception: - logger.exception( - "Failed to send error notification for request %s", -@@ -1488,6 +1588,15 @@ - """ - self.transfer_id_to_request_id = metadata.transfer_id_to_request_id - if self.is_producer: -+ # Drop counts for transfer_ids that dropped out of the live -+ # mapping without ever reaching full ack count (e.g. force-freed -+ # by the scheduler's defer_timeout) -- they can never resolve via -+ # get_finished() anymore, so stop tracking them to bound memory. -+ self._consumer_notification_counts = { -+ xfer_id: count -+ for xfer_id, count in self._consumer_notification_counts.items() -+ if xfer_id in self.transfer_id_to_request_id -+ } - self.moriio_wrapper.async_wait_reqid() - return - if self.mode == MoRIIOMode.WRITE: -@@ -1560,6 +1669,7 @@ - remote_block_ids=meta.remote_block_ids, - remote_host=meta.remote_host, - remote_notify_port=meta.remote_notify_port, -+ remote_tp_size=meta.tp_size, - ) - - def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_layer): -@@ -1653,6 +1763,7 @@ - local_block_ids: list[int], - remote_block_ids: list[int], - remote_moriio_meta: MoRIIOAgentMetadata, -+ remote_tp_size: int | None = None, - ) -> tuple[list[int], list[int], list[int]]: - """Compute transfer offsets for block data. - -@@ -1661,25 +1772,100 @@ - local_block_ids: IDs of local blocks - remote_block_ids: IDs of remote blocks - remote_moriio_meta: Metadata of the remote MoRIIO agent -+ remote_tp_size: tp_size of the remote (producer/prefill) instance. -+ Defaults to this worker's world_size (homogeneous P/D TP). 
When -+ it differs, used to validate that KV heads are replicated (the -+ only heterogeneous-TP regime MoRIIO supports) -- see the guard -+ below. - Returns: - Tuple of (local_offsets, remote_offsets, transfer_sizes) - """ - assert self.kv_cache_shape is not None, "KV caches shape not initialized" -- is_mla = len(self.kv_cache_shape) == 3 -- stride = self.kv_caches[layer_name].stride() -- sz = self.kv_caches[layer_name].element_size() -- if is_mla: -- blknum, blksize, hs = self.kv_cache_shape -- hn = 1 -- block_stride = stride[0] -- else: -- _, blknum, blksize, hn, hs = self.kv_cache_shape -- local_ktov_stride = stride[0] -- block_stride = stride[1] -- remote_ktov_stride = block_stride * remote_moriio_meta.num_blocks -+ # Per-layer, axis-aware geometry. -+ # -+ # The KV tensors vLLM hands the connector are laid out (verified on -+ # MiniMax-M3 / TRITON_ATTN, vLLM 0.22.1): -+ # * main attention (GQA, dense + sparse layers): -+ # shape (num_blocks, 2, block_size, num_kv_heads, head_dim) -+ # -- the K/V axis (size 2) is INNER (axis 1), num_blocks is axis 0. -+ # * sparse lightning-indexer (key-only, bf16): -+ # shape (num_blocks, block_size, head_dim) -- rank 3, no K/V axis. -+ # -+ # The legacy code assumed the FlashAttention-style [2, num_blocks, ...] -+ # layout (K/V axis OUTER) and indexed blocks with stride[1] (the K/V -+ # stride) -- transposing block vs K/V so every block read the wrong -+ # region (corruption invisible to throughput-only benchmarks). Instead, -+ # detect the block axis (size == num_blocks) and the optional K/V axis -+ # (size 2) from THIS layer's own shape, and derive strides from them. The -+ # per-block stride is independent of num_blocks, so no remote-num_blocks -+ # scaling is needed. -+ layer_cache = self.kv_caches[layer_name] -+ layer_shape = tuple(layer_cache.shape) -+ stride = layer_cache.stride() -+ sz = layer_cache.element_size() -+ rank = len(layer_shape) -+ -+ # K/V axis = the size-2 axis among the two outermost dims (if any). 
-+ kv_axis: int | None = None -+ if rank >= 4: -+ if layer_shape[0] == 2: -+ kv_axis = 0 -+ elif layer_shape[1] == 2: -+ kv_axis = 1 -+ # Block axis = outermost non-K/V axis (the one indexed by block_id). -+ block_axis = 0 -+ if kv_axis == 0: -+ block_axis = 1 -+ block_stride = stride[block_axis] -+ kv_stride = stride[kv_axis] if kv_axis is not None else 0 -+ per_block = layer_shape[kv_axis] if kv_axis is not None else 1 # 2 (K,V) or 1 -+ -+ # One transferred slab = all dims except the block and K/V axes. -+ slot_elems = 1 -+ for ax in range(rank): -+ if ax == block_axis or ax == kv_axis: -+ continue -+ slot_elems *= layer_shape[ax] -+ -+ # --- Heterogeneous-TP guard (mirrors NIXL add_remote_agent) ----------- -+ # When P/D TP sizes differ, _remote_tp_rank maps each decode rank to a -+ # single remote rank; that whole-block read is byte-correct only when KV -+ # heads are REPLICATED on the remote (prefill) side. -+ # -+ # Supported regimes (replicated heads, i.e. remote_heads <= local_heads): -+ # * D-TP > P-TP (e.g. P4/D8): multiple decode ranks share one prefill -+ # rank's slice (floor-ratio mapping). -+ # * P-TP > D-TP (e.g. P8/D4): each decode rank reads from same-indexed -+ # prefill rank (self.tp_rank mapping). MiniMax-M3's regime: 4 KV heads -+ # fully replicated at TP>=4. -+ # -+ # Unsupported: heads SPLIT on prefill (remote_heads > local_heads). -+ # MoRIIO's NHD layout (heads interleaved per token) makes a head slice -+ # non-contiguous and inexpressible as a single (offset, len) per block. -+ # NIXL raises for the same reason; we do the same. MLA / rank-3 indexer -+ # caches are always replicated (no K/V axis) and bypass this guard. 
-+ local_tp = self.world_size -+ remote_tp = remote_tp_size if remote_tp_size is not None else local_tp -+ if remote_tp != local_tp and not self.use_mla and kv_axis is not None: -+ total_kv_heads = self.model_config.get_total_num_kv_heads() -+ remote_heads = max(1, total_kv_heads // remote_tp) -+ local_heads = max(1, total_kv_heads // local_tp) -+ if remote_heads > local_heads: -+ # KV heads are SPLIT on prefill -- whole-block read is incorrect. -+ # Applies in both TP-mismatch directions; fail loud. -+ raise NotImplementedError( -+ f"Heterogeneous-TP head splitting (total_kv_heads " -+ f"{total_kv_heads} > prefill tp_size {remote_tp}: " -+ f"{remote_heads} heads/rank on prefill vs {local_heads} on " -+ "decode) requires per-head slicing of an NHD KV layout, not " -+ "supported by MoRIIOConnector. Use PREFILL_TP_SIZE >= " -+ "total_kv_heads so KV heads are replicated." -+ ) -+ # remote_heads <= local_heads: replicated. _remote_tp_rank selects the -+ # correct remote rank; whole-block read is byte-correct. -+ -+ transfer_size_byte = slot_elems * sz - -- transfer_size_byte = blksize * hn * hs * sz -- per_block = 1 if is_mla else 2 - total = len(local_block_ids) * per_block - offset_local = [0] * total - offset_remote = [0] * total -@@ -1688,17 +1874,9 @@ - w = 0 - for i, lb in enumerate(local_block_ids): - rb = remote_block_ids[i] -- # K -- offset_local[w] = sz * (lb * block_stride) -- offset_remote[w] = sz * (rb * block_stride) -- w += 1 -- if not is_mla: -- # V -- # Handle num_block variations originating from PD (different kv strides) -- # TODO: address block_sz differences in heterogeneous TP scenarios -- # In MLA, we don't need to consider these two cases. 
-- offset_local[w] = sz * (1 * local_ktov_stride + lb * block_stride) -- offset_remote[w] = sz * (1 * remote_ktov_stride + rb * block_stride) -+ for kv in range(per_block): -+ offset_local[w] = sz * (lb * block_stride + kv * kv_stride) -+ offset_remote[w] = sz * (rb * block_stride + kv * kv_stride) - w += 1 - - merged_l, merged_r, merged_s = self.merge_contiguous_blocks( -@@ -1715,6 +1893,7 @@ - transfer_id: str, - remote_host: str, - remote_notify_port: int, -+ remote_tp_size: int, - ) -> None: - if self.mode == MoRIIOMode.WRITE: - return -@@ -1722,15 +1901,30 @@ - dp0_engine_id = self.get_engine_name_with_dp(dst_engine_id, 0) - sessions, remote_moriio_meta = self._get_built_session(dp0_engine_id) - -- first_layer = list(self.layer_name_to_local_kv_cache_metadata.keys())[0] -- offs = self._compute_block_transfer_offsets( -- first_layer, local_block_ids, remote_block_ids, remote_moriio_meta -- ) -- -- for layer_name in self.layer_name_to_local_kv_cache_metadata: -- sess_idx = list(self.layer_name_to_local_kv_cache_metadata.keys()).index( -- layer_name -+ # Heterogeneous-KV models register layers with different shapes/dtypes in -+ # a single KV-cache group sharing one block table, so block_ids match -+ # across layers but per-block byte geometry does not. Compute offsets per -+ # distinct layer geometry (memoized by shape/stride/dtype) so the rank-3 -+ # bf16 indexer cache isn't read with the rank-5 fp8 main-cache sizing. 
-+ layer_names = list(self.layer_name_to_local_kv_cache_metadata.keys()) -+ offs_by_geom: dict = {} -+ for sess_idx, layer_name in enumerate(layer_names): -+ layer_cache = self.kv_caches[layer_name] -+ geom_key = ( -+ tuple(layer_cache.shape), -+ tuple(layer_cache.stride()), -+ layer_cache.dtype, - ) -+ offs = offs_by_geom.get(geom_key) -+ if offs is None: -+ offs = self._compute_block_transfer_offsets( -+ layer_name, -+ local_block_ids, -+ remote_block_ids, -+ remote_moriio_meta, -+ remote_tp_size=remote_tp_size, -+ ) -+ offs_by_geom[geom_key] = offs - # TODO : apply multi-session batch-read when moriio support it - transfer_status = self.moriio_wrapper.read_remote_data( - offs[2], offs[0], offs[1], sessions[sess_idx] -@@ -1739,6 +1933,6 @@ - self._recving_transfers[request_id].append(transfer_status) - self._recving_transfers_callback_addr[request_id] = ( - remote_host, -- str(remote_notify_port + self.tp_rank), -+ str(remote_notify_port + self._remote_tp_rank(remote_tp_size)), - transfer_id, - ) ---- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py -+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_engine.py -@@ -279,21 +279,33 @@ - Returns: - The transfer plan - """ -- # Compute offsets if not cached -- if request_info.transfer_offset is None: -+ # Compute offsets per distinct layer geometry. Heterogeneous-KV models -+ # (e.g. MiniMax-M3's sparse indexer) place rank-3 bf16 and rank-5 fp8 -+ # caches in one KV-cache group; caching a single offset set per request -+ # and reusing it for every layer corrupts the indexer cache. Block_ids -+ # are shared (single block table), so offsets depend only on the layer's -+ # shape/stride/dtype -- memoize by that geometry key. 
-+ layer_cache = self.worker.kv_caches[task.layer_name] -+ geom_key = ( -+ tuple(layer_cache.shape), -+ tuple(layer_cache.stride()), -+ layer_cache.dtype, -+ ) -+ offsets = request_info.transfer_offsets.get(geom_key) -+ if offsets is None: - offsets = self.worker._compute_block_transfer_offsets( - task.layer_name, - task.local_block_ids, - request_info.block_ids, - remote_moriio_meta, - ) -- request_info.transfer_offset = offsets -+ request_info.transfer_offsets[geom_key] = offsets - - # Get session index - layer_names = list(self.worker.layer_name_to_local_kv_cache_metadata.keys()) - sess_idx = layer_names.index(task.layer_name) - -- local_off, remote_off, sizes = request_info.transfer_offset -+ local_off, remote_off, sizes = offsets - - return LayerTransferPlan( - request_id=task.request_id, -@@ -671,9 +683,14 @@ - raise - - def pop_finished_req_ids(self): -- # producer invocation: get the set of completed requests at the decode -+ # Producer invocation: get all completion messages received since the -+ # last call. Returned as a list, NOT deduped -- with heterogeneous-TP -+ # fan-out, two different decode ranks can send byte-identical -+ # messages for the same transfer_id, and the caller (get_finished()) -+ # needs to count every individual occurrence to know when all -+ # expected consumers have acked. - with self.lock: -- done_send = set(self.done_req_ids) -+ done_send = list(self.done_req_ids) - self.done_req_ids = [] - return done_send - From 38be6bed1f3138b001f99f325e8fd68b3cb27163 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 24 Jun 2026 18:38:00 -0400 Subject: [PATCH 20/20] fix: append M3 MI355X disagg changelog entry at end of file The minimaxm3-fp8-mi355x-vllm-disagg entry was inserted mid-file (after the #1862 entry), which violates the append-only changelog gate ("entry 511 changed; existing entries are immutable"). 
Move it to the end of perf-changelog.yaml so existing entries stay byte-identical to main and the new entry is a clean append. Co-Authored-By: Claude Opus 4.8 --- perf-changelog.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 50ce696af..54fb2d7dd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4072,18 +4072,6 @@ - "8k/1k: 1p4d-dep4-tep4 (conc 128), 1p4d-dep4-tp8 (conc 4-256), 3p1d-dep4-dep16 (conc 1024), 6p1d-dep4-dep16 (conc 3072), 8p1d-dep4-dep16 (conc 6144)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1862 -- config-keys: - - minimaxm3-fp8-mi355x-vllm-disagg - description: - - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3" - - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed" - - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)" - - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)" - - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). 
The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs" - - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)" - - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762 - - config-keys: - dsv4-fp4-mi355x-sglang description: @@ -4165,3 +4153,15 @@ - "Run the PR #1891 MiniMax-M3 MXFP8 B300 Dynamo-vLLM recipe set on top of current main." - "Uses the vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 image and the TEP4/TEP8 8k1k topologies not covered by PR #1890." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1891 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm-disagg + description: + - "Initial submission: MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the day-zero ROCm image (vllm/vllm-openai-rocm:minimax-m3) — 1 prefill (TP8) + 1 decode (TP8) across conc 1,2,4,8,16, validating the MoRI-IO KV-transfer disagg pipeline end-to-end for M3" + - "Layered on the MoRI-IO patch-removal infra (#1585): uses benchmarks/multi_node/amd_utils with the runtime MoRI patches removed" + - "Per-worker serve flags (models_vllm.yaml MiniMax-M3-MXFP8): --block-size 128 (MSA), --language-model-only, --kv-cache-dtype fp8, --attention-backend TRITON_ATTN, minimax_m3 parsers; no EP (TP8, MoE experts TP-sharded)" + - "M3 disagg script points MODEL_PATH at the cluster's shared HF cache (/it-share/hf-hub-cache) where the ~414 GB MiniMax-M3-MXFP8 checkpoint is pre-staged, instead of the launcher default /it-share/data; scoped to M3 only (other disagg models keep /it-share/data)" + - "Sweeps conc 1,2,4,8,16,32,64,128,256,512,1024 at both 1k1k and 8k1k (1P TP8 + 1D TP8). The 8k1k point makes the multi-node eval policy (8k1k + conc >= 16) mark one lm-eval on the highest-max-conc layout (eval-conc=median), validating the disagg pipeline's correctness; run with non-canary-full-sweep-enabled so the eval entry actually runs" + - "Adds two asymmetric prefill/decode layouts at both 1k1k and 8k1k alongside the TP8+TP8 sweep: 1P TP4 + 1D TP8 (smaller prefill, full-node decode) at conc 1,2,4,8,16,32,64,128,256; and balanced 1P TP4 + 1D TP4 at conc 64,128,256,512,1024. 
Per-worker TP comes from the master-config prefill/decode tp (server_vllm.sh rewrites the models_vllm.yaml --tensor-parallel-size placeholder); no EP, dp-attn off, PREFILL_NODES=1/DECODE_NODES=1 (TP4 uses half an 8-GPU node)" + - "Adds a 2P TP4 + 1D TP8 layout at both 1k1k and 8k1k for high conc 256,512,768,1024: two TP4 prefill workers (num-worker 2, PREFILL_NODES=2, each TP4 on half an 8-GPU node) feeding one TP8 decode (DECODE_NODES=1); 3 nodes total" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1762