diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f6166699a..0d7000868 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2537,19 +2537,14 @@ minimaxm3-fp8-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 64 }
+      - { tp: 8, conc-start: 1, conc-end: 32 }
+      - { tp: 4, conc-start: 4, conc-end: 32 }
       - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
-      - { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64 }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
-      - { tp: 4, conc-start: 1, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
+      - { tp: 8, conc-start: 1, conc-end: 2 }
+      - { tp: 4, conc-start: 2, conc-end: 128 }
 
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
 # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
@@ -2574,18 +2569,17 @@ minimaxm3-fp8-mi355x-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
       - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
+      - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
+      - { tp: 4, ep: 4, conc-start: 128, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
+      - { tp: 4, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+      - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp }
 
 # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
 minimaxm3-fp4-mi355x-vllm-disagg:
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
index 9ec86f517..23c1a2f7f 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -31,6 +31,7 @@ fi
 SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
index 757d54786..87c07a35a 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -61,6 +61,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 # Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # avoids the M3-decode breakable-cudagraph path that previously forced eager.
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1cbadb492..7bf15ff2b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4296,6 +4296,18 @@
     - "Benchmark configuration, EAGLE3 draft model, serving flags, and search space are unchanged."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1941
 
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e, which includes the gfx950 mxfp8 MoE/linear tuning for MiniMax-M3 (vllm-project/vllm#45725)."
+    - "Export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 in both the standard and EAGLE3 (MTP) bench scripts to enable INT6-quantized quick all-reduce on CDNA4/gfx950, reducing TP all-reduce cost for the mxfp8 workload."
+    - "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points: TP2/EP2 and all DP-attention entries are removed, and full TP8/EP8 is removed everywhere except the 1k1k MTP sweep."
+    - "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-32), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)."
+    - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-512); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)."
+    - "Serving flags are otherwise unchanged."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946
+
 - config-keys:
     - minimaxm3-fp8-mi355x-vllm
   description: