From eed46492323662517fbf230e122c6c3efbc71d99 Mon Sep 17 00:00:00 2001
From: Chun Fang
Date: Wed, 6 May 2026 19:42:29 +0000
Subject: [PATCH] Re-append perf-changelog entry for minimaxm2.5-fp8-mi355x-vllm (#1276)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #1276 ("Tune MiniMax MI355X vLLM scheduling thresholds") landed
without a perf-changelog entry — the prepared entry was dropped in
commit 8d8b1e0 ("Remove MiniMax perf changelog entry") before merge, so
the tuned recipe never re-ran on push-to-main and the dashboard still
reflects the old launch policy.

Re-add the entry so a sweep is triggered for the new policy and the
change is documented chronologically. The entry references the original
PR #1276, matching the convention used for prior changelog re-appends
(e.g. #1269).

Co-authored-by: Cursor
---
 perf-changelog.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ffa215468..6a03fc592 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2253,3 +2253,14 @@
   description:
     - "Re-run qwen3.5-fp8-b200-sglang-mtp sweep after the B200 DGXC Slurm partition change (gpu → gpu-2)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1292
+
+- config-keys:
+    - minimaxm2.5-fp8-mi355x-vllm
+  description:
+    - "Tune MiniMax-M2.5 FP8 MI355X vLLM scheduling thresholds for better throughput and stability across the 1k/1k and 8k/1k sweep points"
+    - "Default path: block-size=32, shuffled KV cache disabled (VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0), async scheduling enabled"
+    - "1k/1k TP8/EP8: keep block-size=32 and shuffled KV cache disabled; disable async scheduling (--no-async-scheduling)"
+    - "1k/1k non-TP8/EP8: block-size=16 with shuffled KV cache enabled; disable async scheduling through c128"
+    - "8k/1k TP8/EP8: keep block-size=32 and shuffled KV cache disabled; disable AITER MoE (VLLM_ROCM_USE_AITER_MOE=0); disable async scheduling"
+    - "8k/1k non-TP8/EP8: disable async scheduling through c64; switch to block-size=16 with shuffled KV cache at c64 and above"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1276